512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.RData

148 changes: 145 additions & 3 deletions main.go
@@ -1,7 +1,11 @@
package main

import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"strings"
"time"
)

@@ -19,8 +23,8 @@ func timeTrack(start time.Time, name string) {
//It also calls the compute functions and outputs the returned structs
//to stdout.
func main() {
defer timeTrack(time.Now(), "compute diff")
fmt.Println(computeDiff())
//defer timeTrack(time.Now(), "compute diff")
//fmt.Println(computeDiff())

defer timeTrack(time.Now(), "compute AST")
fmt.Println(computeAST())
@@ -36,12 +40,150 @@ func main() {
// list of function calls seen in the diffs and their number of calls
func computeDiff() *diffResult {

//s :=make([]string, 1)




return nil
}

//computeAST goes through the AST and returns
//an astResult struct that contains all the variable declarations
func computeAST() *astResult {

vars := make([]variableDescription, 0)

//path to ast json file
path := "./ast/astChallenge.json"
jsonFile, err := os.Open(path)

if err != nil {
fmt.Println(err)
return nil
}

fmt.Println("Successfully opened "+path)
defer jsonFile.Close()

byteValue, _ := ioutil.ReadAll(jsonFile)

var root Root
if err := json.Unmarshal(byteValue, &root); err != nil {
fmt.Println(err)
return nil
}

allNodes := decl(root.Root)

for i := 0; i < len(allNodes); i++ {
if allNodes[i].Type == "LocalDeclarationStatement" {
variable := getVar(allNodes[i])
vars = append(vars, variableDescription{variable[1], variable[0]})
}
}

return &astResult{vars}
}

//decl recursively collects the node v and all of its descendants in the AST
func decl(v Type) []Type {
returnTypes := make([]Type, 0)
returnTypes = append(returnTypes, v)

for i := 0; i < len(v.Children); i++ {
returnTypes = append(returnTypes, decl(v.Children[i])...)
}

return returnTypes
}
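To make the traversal above concrete, here is a minimal sketch. It is not part of this PR; it assumes a hypothetical main_test.go placed in the same package as main.go, and it only checks that decl returns the root node together with every descendant of a small hand-built tree.

```go
package main

import "testing"

// TestDeclCollectsAllNodes is a hypothetical sketch (not in this PR): it builds
// a tiny tree and checks that decl's flattened result contains the root and all
// of its descendants.
func TestDeclCollectsAllNodes(t *testing.T) {
	tree := Type{
		Type: "Parent",
		Children: []Type{
			{Type: "ChildA"},
			{Type: "ChildB", Children: []Type{{Type: "GrandChild"}}},
		},
	}

	seen := make(map[string]bool)
	for _, n := range decl(tree) {
		seen[n.Type] = true
	}

	for _, want := range []string{"Parent", "ChildA", "ChildB", "GrandChild"} {
		if !seen[want] {
			t.Errorf("decl result is missing node %q", want)
		}
	}
}
```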

//getVar takes specific nodes from the AST and searches successive nodes for variables.
//It works via a keyword search: the function scans all nodes indexed after the input node
//until it finds a node whose "Type" field is "IdentifierToken" or contains "Keyword".
//In our example, nodes with these types always carry some information about a declared variable;
//in particular, this holds when searching through the children of a "LocalDeclarationStatement",
//which is the only context in which this function is called.
//Regrettably, there's an argument to be made that this is essentially hard-coding, but the only
//discernible pattern among variables in the AST is the presence of certain keywords, so this is
//the most obvious approach, as well as the quickest.
//Because this is a keyword search and our example does not contain user-defined variable types,
//this function will not find them. That could be fixed once the keywords for those types are known.
//(A small usage sketch follows the function below.)
func getVar(v Type) [2]string {

var identifier string
var datatype string

nodes := decl(v)


for i := 0; i < len(nodes); i++ {
if nodes[i].Type == "VariableDeclarator" {
j := 1
for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
j++
}
if i+j < len(nodes) {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
} else {
identifier = nodes[i+j].Value
}
}
continue
} else if nodes[i].Type == "VariableDeclaration" {
j := 1
for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
j++
}
if i+j < len(nodes) {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
} else {
identifier = nodes[i+j].Value
}

//This handles the case where a variable is declared as "var x = new type[n]".
//In our example, this only occurs when an array is declared, so we handle that possibility here.
if nodes[i+j].Value == "var" {
for i+j < len(nodes) && nodes[i+j].Type != "CloseBracketToken" {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
}
if nodes[i+j].Type == "OpenBracketToken" {
datatype = strings.Join([]string{datatype, "["}, "")
}
j++
if i+j < len(nodes) && nodes[i+j].Type == "CloseBracketToken" {
datatype = strings.Join([]string{datatype, "]"}, "")
}
}
}
}
continue
}
}

return [...]string{datatype, identifier}

}
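The keyword search described above can be exercised the same way. The sketch below is again a hypothetical test in the same package, not part of this PR: it hand-builds the node shapes the comment mentions (a LocalDeclarationStatement containing a VariableDeclaration and a VariableDeclarator) and checks that getVar recovers the declared type and identifier for an assumed "int x" declaration. The node names are assumptions based on the keywords the function searches for.

```go
package main

import "testing"

// TestGetVarFindsTypeAndIdentifier is a hypothetical sketch (not in this PR):
// the tree below mimics the Roslyn-style node names that getVar's keyword
// search expects for a simple "int x = ...;" declaration.
func TestGetVarFindsTypeAndIdentifier(t *testing.T) {
	stmt := Type{
		Type: "LocalDeclarationStatement",
		Children: []Type{
			{
				Type: "VariableDeclaration",
				Children: []Type{
					{Type: "PredefinedType", Children: []Type{
						{Type: "IntKeyword", Value: "int"},
					}},
					{Type: "VariableDeclarator", Children: []Type{
						{Type: "IdentifierToken", Value: "x"},
					}},
				},
			},
		},
	}

	// getVar returns [datatype, identifier].
	got := getVar(stmt)
	if got[0] != "int" || got[1] != "x" {
		t.Fatalf(`expected ["int" "x"], got %v`, got)
	}
}
```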


type Root struct {
UUID string `json:"uuid"`
Root Type `json:"Root"`
}

type Type struct {
Type string `json:"Type"`
Value string `json:"ValueText"`
Children []Type `json:"Children"`
}
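Since ast/astChallenge.json itself is not included in the diff, here is a standalone sketch of the JSON shape these structs imply, built purely from the struct tags above. The uuid value and the node names inside are invented for illustration, and the structs are repeated so the sketch compiles on its own.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Standalone sketch (not part of this PR). The JSON below is a guessed fragment
// that only needs to match the tags "uuid", "Root", "Type", "ValueText" and
// "Children" declared in main.go.
type Root struct {
	UUID string `json:"uuid"`
	Root Type   `json:"Root"`
}

type Type struct {
	Type     string `json:"Type"`
	Value    string `json:"ValueText"`
	Children []Type `json:"Children"`
}

const sample = `{
  "uuid": "00000000-0000-0000-0000-000000000000",
  "Root": {
    "Type": "CompilationUnit",
    "ValueText": "",
    "Children": [
      {
        "Type": "LocalDeclarationStatement",
        "ValueText": "",
        "Children": [
          {"Type": "IntKeyword", "ValueText": "int", "Children": []},
          {"Type": "IdentifierToken", "ValueText": "x", "Children": []}
        ]
      }
    ]
  }
}`

func main() {
	var root Root
	if err := json.Unmarshal([]byte(sample), &root); err != nil {
		fmt.Println(err)
		return
	}
	// Prints: LocalDeclarationStatement has 2 children
	for _, child := range root.Root.Children {
		fmt.Println(child.Type, "has", len(child.Children), "children")
	}
}
```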
36 changes: 36 additions & 0 deletions seq.R
@@ -0,0 +1,36 @@
library(glmnet)
library(neuralnet)
library(NeuralNetTools)
library(nnet)
library(randomForest)

set.seed(212919156)

#read in the sample data
sample <- read.csv("seq/sample.csv")
sample$class = factor(sample$class)

#Use a random forest to generate a model to predict class
rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))

#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
varImpPlot(rf)

#Based on the variable importance plot, the following variables fall below an arbitrary MeanDecreaseGini threshold of 200,
#so we'll disregard them and rebuild the data without them
simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]

#now using only those, we can run a logistic regression
simlog <- glm(class ~ ., family="binomial", data=simdat)

#Now get the confusion matrix for both
#First the randomForest
table(predict(rf, newdata=sample[,-1]), sample$class)

#Then the logistic regression
table(predict(simlog, type="response") > 0.5, sample$class)

#Obviously, the randomForest performs better for predictions, so given data about the next events, we would use the randomForest
#model to predict the class. Random forests have built-in cross-validation as well, due to out-of-bag estimation,
#so there's no need to additionally cross-validate this model.

61 changes: 61 additions & 0 deletions seq.Rmd
@@ -0,0 +1,61 @@
---
title: "Seq.Rmd"
author: "Jeff B"
date: "May 24, 2019"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Sequence Question

This is a sequence classification problem. Given the large number of variables, however, we can start by examining it as a straightforward classification problem, which gives us one of the main benefits of that framing: variable selection. This part was done in R.
The sequence classification portion was done in Python and can be found in the seq folder as seq.py.

Start by importing the necessary libraries and setting the seed for reproducibility:
```{r, echo=FALSE, warning=FALSE}
library(glmnet)
library(neuralnet)
library(NeuralNetTools)
library(nnet)
library(randomForest)

set.seed(212919156)
```



Next, we read in the data and generate a random forest model for it. Random forests have built-in cross-validation via out-of-bag estimates, so we don't need any extra cross-validation. We can also look at the variable importance plot to determine the most important variables. Somewhat arbitrarily, we'll set a cutoff at a MeanDecreaseGini of 200, which leaves us with 15 relevant variables.
```{r}
sample <- read.csv("seq/sample.csv")
sample$class = factor(sample$class)

#Use a random forest to generate a model to predict class
rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))

#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
varImpPlot(rf)
```

For comparison, we'll run a logistic regression using only the variables above the threshold.

```{r}
simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
Reviewer comment (Contributor) on the simdat line above: The first column is an ID, so of course it's not relevant by itself, but it can be used to link the information here with the information in the res.csv file, which you neither used nor seem to have looked at. Any reason?


#now using only those, we can run a logistic regression
simlog <- glm(class ~ ., family="binomial", data=simdat)
```

Now we'll get the confusion matrix for both models:
```{r}
#First the randomForest
table(predict(rf, newdata=sample[,-1]), sample$class)

#Then the logistic regression
table(predict(simlog, type="response") > 0.5, sample$class)
```

The randomForest model performs better for predictions, so given data about the next event, we would use the randomForest model to predict the class.
Since we don't have that information, this process has still been useful: we will use only the variables we've deemed relevant in the sequence prediction problem.
Binary file added seq.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions seq/.~lock.sample.csv#
@@ -0,0 +1 @@
,jeff,Rupert,24.05.2019 12:01,file:///home/jeff/.config/libreoffice/4;