512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.RData

148 changes: 145 additions & 3 deletions main.go
@@ -1,7 +1,11 @@
package main

import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"strings"
"time"
)

@@ -19,8 +23,8 @@ func timeTrack(start time.Time, name string) {
//It also calls the compute functions and outputs the returned structs
//to stdout.
func main() {
defer timeTrack(time.Now(), "compute diff")
fmt.Println(computeDiff())
//defer timeTrack(time.Now(), "compute diff")
//fmt.Println(computeDiff())

defer timeTrack(time.Now(), "compute AST")
fmt.Println(computeAST())
@@ -36,12 +40,150 @@ func main() {
// list of function calls seen in the diffs and their number of calls
func computeDiff() *diffResult {

//s :=make([]string, 1)




return nil
}

//computeAST goes through the AST and returns
//an astResult struct that contains all the variable declarations
func computeAST() *astResult {

vars := make([]variableDescription, 0)

//path to ast json file
path := "./ast/astChallenge.json"
jsonFile, err := os.Open(path)

if err != nil {
fmt.Println(err)
return nil
}

fmt.Println("Successfully opened "+path)
defer jsonFile.Close()

byteValue, _ := ioutil.ReadAll(jsonFile)

var root Root
if err := json.Unmarshal(byteValue, &root); err != nil {
fmt.Println(err)
return nil
}

allNodes := decl(root.Root)

for i := 0; i < len(allNodes); i++ {
if allNodes[i].Type == "LocalDeclarationStatement" {
variable := getVar(allNodes[i])
vars = append(vars, variableDescription{variable[1], variable[0]})
}
}

return &astResult{vars}
}

//decl recursively collects the node v and all of its descendants in the AST
func decl(v Type) []Type {
returnTypes := make([]Type, 0)
returnTypes = append(returnTypes, v)

for i := 0; i < len(v.Children); i++ {
returnTypes = append(returnTypes, decl(v.Children[i])...)
}

return returnTypes
}
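To make the traversal above concrete, here is a minimal sketch. It is not part of this PR; it assumes a hypothetical main_test.go placed in the same package as main.go, and it only checks that decl returns the root node together with every descendant of a small hand-built tree.

```go
package main

import "testing"

// TestDeclCollectsAllNodes is a hypothetical sketch (not in this PR): it builds
// a tiny tree and checks that decl's flattened result contains the root and all
// of its descendants.
func TestDeclCollectsAllNodes(t *testing.T) {
	tree := Type{
		Type: "Parent",
		Children: []Type{
			{Type: "ChildA"},
			{Type: "ChildB", Children: []Type{{Type: "GrandChild"}}},
		},
	}

	seen := make(map[string]bool)
	for _, n := range decl(tree) {
		seen[n.Type] = true
	}

	for _, want := range []string{"Parent", "ChildA", "ChildB", "GrandChild"} {
		if !seen[want] {
			t.Errorf("decl result is missing node %q", want)
		}
	}
}
```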

//getVar takes specific nodes from the AST and searches successive nodes for variables.
//It works via a keyword search: the function scans all nodes indexed after the input node
//until it finds a node whose "Type" field is "IdentifierToken" or contains "Keyword".
//In our example, nodes with these types always carry some information about a declared variable;
//in particular, this holds when searching through the children of a "LocalDeclarationStatement",
//which is the only context in which this function is called.
//Regrettably, there's an argument to be made that this is essentially hard-coding, but the only
//discernible pattern among variables in the AST is the presence of certain keywords, so this is
//the most obvious approach, as well as the quickest.
//Because this is a keyword search and our example does not contain user-defined variable types,
//this function will not find them. That could be fixed once the keywords for those types are known.
//(A small usage sketch follows the function below.)
func getVar(v Type) [2]string {

var identifier string
var datatype string

nodes := decl(v)


for i := 0; i < len(nodes); i++ {
if nodes[i].Type == "VariableDeclarator" {
j := 1
for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
j++
}
if i+j < len(nodes) {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
} else {
identifier = nodes[i+j].Value
}
}
continue
} else if nodes[i].Type == "VariableDeclaration" {
j := 1
for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
j++
}
if i+j < len(nodes) {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
} else {
identifier = nodes[i+j].Value
}

//This handles the case where a variable is declared as "var x = new type[n]".
//In our example, this only occurs when an array is declared, so we handle that possibility here.
if nodes[i+j].Value == "var" {
for i+j < len(nodes) && nodes[i+j].Type != "CloseBracketToken" {
if strings.Contains(nodes[i+j].Type, "Keyword") {
datatype = nodes[i+j].Value
}
if nodes[i+j].Type == "OpenBracketToken" {
datatype = strings.Join([]string{datatype, "["}, "")
}
j++
if i+j < len(nodes) && nodes[i+j].Type == "CloseBracketToken" {
datatype = strings.Join([]string{datatype, "]"}, "")
}
}
}
}
continue
}
}

return [...]string{datatype, identifier}

}
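The keyword search described above can be exercised the same way. The sketch below is again a hypothetical test in the same package, not part of this PR: it hand-builds the node shapes the comment mentions (a LocalDeclarationStatement containing a VariableDeclaration and a VariableDeclarator) and checks that getVar recovers the declared type and identifier for an assumed "int x" declaration. The node names are assumptions based on the keywords the function searches for.

```go
package main

import "testing"

// TestGetVarFindsTypeAndIdentifier is a hypothetical sketch (not in this PR):
// the tree below mimics the Roslyn-style node names that getVar's keyword
// search expects for a simple "int x = ...;" declaration.
func TestGetVarFindsTypeAndIdentifier(t *testing.T) {
	stmt := Type{
		Type: "LocalDeclarationStatement",
		Children: []Type{
			{
				Type: "VariableDeclaration",
				Children: []Type{
					{Type: "PredefinedType", Children: []Type{
						{Type: "IntKeyword", Value: "int"},
					}},
					{Type: "VariableDeclarator", Children: []Type{
						{Type: "IdentifierToken", Value: "x"},
					}},
				},
			},
		},
	}

	// getVar returns [datatype, identifier].
	got := getVar(stmt)
	if got[0] != "int" || got[1] != "x" {
		t.Fatalf(`expected ["int" "x"], got %v`, got)
	}
}
```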


type Root struct {
UUID string `json:"uuid"`
Root Type `json:"Root"`
}

type Type struct {
Type string `json:"Type"`
Value string `json:"ValueText"`
Children []Type `json:"Children"`
}
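Since ast/astChallenge.json itself is not included in the diff, here is a standalone sketch of the JSON shape these structs imply, built purely from the struct tags above. The uuid value and the node names inside are invented for illustration, and the structs are repeated so the sketch compiles on its own.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Standalone sketch (not part of this PR). The JSON below is a guessed fragment
// that only needs to match the tags "uuid", "Root", "Type", "ValueText" and
// "Children" declared in main.go.
type Root struct {
	UUID string `json:"uuid"`
	Root Type   `json:"Root"`
}

type Type struct {
	Type     string `json:"Type"`
	Value    string `json:"ValueText"`
	Children []Type `json:"Children"`
}

const sample = `{
  "uuid": "00000000-0000-0000-0000-000000000000",
  "Root": {
    "Type": "CompilationUnit",
    "ValueText": "",
    "Children": [
      {
        "Type": "LocalDeclarationStatement",
        "ValueText": "",
        "Children": [
          {"Type": "IntKeyword", "ValueText": "int", "Children": []},
          {"Type": "IdentifierToken", "ValueText": "x", "Children": []}
        ]
      }
    ]
  }
}`

func main() {
	var root Root
	if err := json.Unmarshal([]byte(sample), &root); err != nil {
		fmt.Println(err)
		return
	}
	// Prints: LocalDeclarationStatement has 2 children
	for _, child := range root.Root.Children {
		fmt.Println(child.Type, "has", len(child.Children), "children")
	}
}
```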
36 changes: 36 additions & 0 deletions seq.R
@@ -0,0 +1,36 @@
library(glmnet)
library(neuralnet)
library(NeuralNetTools)
library(nnet)
library(randomForest)

set.seed(212919156)

#read in the sample data
sample <- read.csv("seq/sample.csv")
sample$class = factor(sample$class)

#Use a random forest to generate a model to predict class
rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))

#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
varImpPlot(rf)

#Based on the variable importance plot, the following variables fall below an arbitrary MeanDecreaseGini threshold of 200,
#so we'll disregard them and rebuild the data without them
simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]

#now using only those, we can run a logistic regression
simlog <- glm(class ~ ., family="binomial", data=simdat)

#Now get the confusion matrix for both
#First the randomForest
table(predict(rf, newdata=sample[,-1]), sample$class)

#Then the logistic regression
table(predict(simlog, type="response") > 0.5, sample$class)

#Obviously, the randomForest performs better for predictions, so given data about the next events, we would use the randomForest
#model to predict the class. Random forests have built-in cross-validation as well, due to out-of-bag estimation,
#so there's no need to additionally cross-validate this model.

61 changes: 61 additions & 0 deletions seq.Rmd
@@ -0,0 +1,61 @@
---
title: "Seq.Rmd"
author: "Jeff B"
date: "May 24, 2019"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Sequence Question

This is a sequence classification problem. Given the large number of variables, however, we can start by examining it as a straightforward classification problem, which gives us one of the main benefits of that framing: variable selection. This part was done in R.
The sequence classification portion was done in Python and can be found in the seq folder as seq.py.

Start by importing the necessary libraries and setting the seed for reproducibility:
```{r, echo=FALSE, warning=FALSE}
library(glmnet)
library(neuralnet)
library(NeuralNetTools)
library(nnet)
library(randomForest)

set.seed(212919156)
```



Next, we read in the data and generate a random forest model for it. Random forests have built-in cross-validation via out-of-bag estimates, so we don't need any extra cross-validation. We can also look at the variable importance plot to determine the most important variables. Somewhat arbitrarily, we'll set a cutoff at a MeanDecreaseGini of 200, which leaves us with 15 relevant variables.
```{r}
sample <- read.csv("seq/sample.csv")
sample$class = factor(sample$class)

#Use a random forest to generate a model to predict class
rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))

#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
varImpPlot(rf)
```

For comparison, we'll run a logistic regression using only the variables above the threshold.

```{r}
simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
Reviewer comment (Contributor) on the simdat line above: The first column is an ID, so of course it's not relevant by itself, but it can be used to link the information here with the information in the res.csv file, which you neither used nor seem to have looked at. Any reason?


#now using only those, we can run a logistic regression
simlog <- glm(class ~ ., family="binomial", data=simdat)
```

Now we'll get the confusion matrix for both models:
```{r}
#First the randomForest
table(predict(rf, newdata=sample[,-1]), sample$class)

#Then the logistic regression
table(predict(simlog, type="response") > 0.5, sample$class)
```

The randomForest model performs better for predictions, so given data about the next event, we would use the randomForest model to predict the class.
Since we don't have that information, this process has still been useful: we will use only the variables we've deemed relevant in the sequence prediction problem.
Binary file added seq.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions seq/.~lock.sample.csv#
@@ -0,0 +1 @@
,jeff,Rupert,24.05.2019 12:01,file:///home/jeff/.config/libreoffice/4;