diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c5c99fe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.RData
+
diff --git a/main.go b/main.go
index ac77289..7376df1 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,11 @@
 package main
 
 import (
+	"encoding/json"
 	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
 	"time"
 )
 
@@ -19,8 +23,8 @@ func timeTrack(start time.Time, name string) {
 //It also calls the compute and output the returned struct
 //to stdout.
 func main() {
-	defer timeTrack(time.Now(), "compute diff")
-	fmt.Println(computeDiff())
+	//defer timeTrack(time.Now(), "compute diff")
+	//fmt.Println(computeDiff())
 
 	defer timeTrack(time.Now(), "compute AST")
 	fmt.Println(computeAST())
@@ -36,6 +40,11 @@ func main() {
 // list of function calls seen in the diffs and their number of calls
 func computeDiff() *diffResult {
+	//s :=make([]string, 1)
+
+
+
+
 	return nil
 }
 
@@ -43,5 +52,138 @@ func computeDiff() *diffResult {
 //a astResult struct that contains all the variable declarations
 func computeAST() *astResult {
-	return nil
+	vars := make([]variableDescription, 0)
+
+	//path to ast json file
+	path := "./ast/astChallenge.json"
+	jsonFile, err := os.Open(path)
+
+	if err != nil {
+		fmt.Println(err)
+		return nil
+	}
+
+	fmt.Println("Successfully opened " + path)
+	defer jsonFile.Close()
+
+	byteValue, _ := ioutil.ReadAll(jsonFile)
+
+	var root Root
+	json.Unmarshal(byteValue, &root)
+
+	allNodes := decl(root.Root)
+
+	for i := 0; i < len(allNodes); i++ {
+		if allNodes[i].Type == "LocalDeclarationStatement" {
+			variable := getVar(allNodes[i])
+			vars = append(vars, variableDescription{variable[1], variable[0]})
+		}
+	}
+
+	r := astResult{vars}
+
+	return &r
+}
+
+//This function gets all nodes in the ast rooted at v
+func decl(v Type) []Type {
+	returnTypes := make([]Type, 0)
+
+	if len(v.Children) == 0 {
+		returnTypes = append(returnTypes, v)
+	} else {
+		returnTypes = append(returnTypes, v)
+		for i := 0; i < len(v.Children); i++ {
+			appends := decl(v.Children[i])
+			for j := 0; j < len(appends); j++ {
+				returnTypes = append(returnTypes, appends[j])
+			}
+		}
+	}
+
+	return returnTypes
+}
+
+//This function takes specific nodes from the ast and searches successive nodes for variables.
+//It works via a keyword search: it scans all nodes indexed after the input node until it finds one
+//whose "Type" field is "IdentifierToken" or contains "Keyword".
+//In our example, nodes with these types always carry some information about a declared variable;
+//in particular, this is the case when searching through the children of a "LocalDeclarationStatement",
+//which is the only time this function is called.
+//There is an argument that this amounts to hard-coding, but the only discernible pattern among
+//variables in the ast is the presence of these keywords, so this is the most obvious approach, as well as the quickest.
+//Because this is a keyword search, and our example does not contain user-defined variable types,
+//the function will not find user-defined types. That could be fixed once the keywords for those types are known.
+func getVar(v Type) [2]string {
+
+	var identifier string
+	var datatype string
+
+	nodes := decl(v)
+
+	for i := 0; i < len(nodes); i++ {
+		if nodes[i].Type == "VariableDeclarator" {
+			j := 1
+			for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
+				j++
+			}
+			if i+j < len(nodes) {
+				if strings.Contains(nodes[i+j].Type, "Keyword") {
+					datatype = nodes[i+j].Value
+				} else {
+					identifier = nodes[i+j].Value
+				}
+			}
+			continue
+		} else if nodes[i].Type == "VariableDeclaration" {
+			j := 1
+			for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
+				j++
+			}
+			if i+j < len(nodes) {
+				if strings.Contains(nodes[i+j].Type, "Keyword") {
+					datatype = nodes[i+j].Value
+				} else {
+					identifier = nodes[i+j].Value
+				}
+
+				//This handles the case where a variable is declared as "var x = new type y"
+				//in our example, this only occurs when an array is declared, so we handle that possibility here
+				if nodes[i+j].Value == "var" {
+					for i+j < len(nodes) && nodes[i+j].Type != "CloseBracketToken" {
+						if strings.Contains(nodes[i+j].Type, "Keyword") {
+							datatype = nodes[i+j].Value
+						}
+						if nodes[i+j].Type == "OpenBracketToken" {
+							datatype = strings.Join([]string{datatype, "["}, "")
+						}
+						j++
+						if i+j < len(nodes) && nodes[i+j].Type == "CloseBracketToken" {
+							datatype = strings.Join([]string{datatype, "]"}, "")
+						}
+					}
+				}
+			}
+			continue
+		}
+	}
+
+	return [...]string{datatype, identifier}
+
+}
+
+type Root struct {
+	UUID string `json:"uuid"`
+	Root Type   `json:"Root"`
+}
+
+type Type struct {
+	Type     string `json:"Type"`
+	Value    string `json:"ValueText"`
+	Children []Type `json:"Children"`
 }
diff --git a/seq.R b/seq.R
new file mode 100644
index 0000000..0048f20
--- /dev/null
+++ b/seq.R
@@ -0,0 +1,36 @@
+library(glmnet)
+library(neuralnet)
+library(NeuralNetTools)
+library(nnet)
+library(randomForest)
+
+set.seed(212919156)
+
+#read in the sample data
+sample <- read.csv("seq/sample.csv")
+sample$class = factor(sample$class)
+
+#Use a random forest to generate a model to predict class
+rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))
+
+#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
+varImpPlot(rf)
+
+#based on the variable importance plot, the following variables fall below an arbitrary 200 MeanDecreaseGini threshold,
+#so we'll disregard them and rebuild the data without them
+simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
+
+#now using only those, we can run a logistic regression
+simlog <- glm(class ~ ., family="binomial", data=simdat)
+
+#Now get the confusion matrix for both
+#First the randomForest
+table(predict(rf, newdata=sample[,-1]), sample$class)
+
+#Then the logistic regression
+table(predict(simlog, type="response") > 0.5, sample$class)
+
+#obviously, the randomForest performs better for predictions, so given data about the next events, we would use the randomForest
+#model to predict the class. Random Forests have built-in cross-validation as well, due to Out-of-Bag estimation,
+#so there's no need to additionally cross-validate this model.
+
diff --git a/seq.Rmd b/seq.Rmd
new file mode 100644
index 0000000..8de19f7
--- /dev/null
+++ b/seq.Rmd
@@ -0,0 +1,61 @@
+---
+title: "Seq.Rmd"
+author: "Jeff B"
+date: "May 24, 2019"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Sequence Question
+
+This question is a sequence classification question.
+Considering the large number of variables, however, we can start by examining it as a straightforward classification problem. This gives us some of the benefits of straightforward classification, most importantly variable selection. This part was done in R.
+The sequence classification portion was done in Python, and can be found in the seq folder as seq.py.
+
+Start by importing the necessary libraries and setting the seed for reproducibility.
+```{r, echo=FALSE, warning=FALSE}
+library(glmnet)
+library(neuralnet)
+library(NeuralNetTools)
+library(nnet)
+library(randomForest)
+
+set.seed(212919156)
+```
+
+Next, we read in the data and generate a random forest model for it. Random forests have built-in cross-validation via out-of-bag (OOB) estimates, so we don't need to do any extra cross-validation. We can also look at the variable importance plot here to determine the most important variables. Somewhat arbitrarily, we'll set a cutoff at 200 MeanDecreaseGini. This leaves us with 15 relevant variables.
+```{r}
+sample <- read.csv("seq/sample.csv")
+sample$class = factor(sample$class)
+
+#Use a random forest to generate a model to predict class
+rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))
+
+#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
+varImpPlot(rf)
+```
+
+For comparison, we'll run a logistic regression using only the variables above the threshold (a programmatic way to apply the same cutoff is sketched at the end of this section).
+
+```{r}
+simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
+
+#now using only those, we can run a logistic regression
+simlog <- glm(class ~ ., family="binomial", data=simdat)
+```
+
+Now we'll get the confusion matrices for both models.
+```{r}
+#First the randomForest
+table(predict(rf, newdata=sample[,-1]), sample$class)
+
+#Then the logistic regression
+table(predict(simlog, type="response") > 0.5, sample$class)
+```
+
+The randomForest model performs better for predictions, so given data about the next event, we would use the randomForest model to predict the class.
+Considering we don't have that information, this process has still been useful, since we will use only those variables we've deemed relevant in our sequence prediction problem.
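+
+The reduced data set above was built from column indices read off the importance plot by hand. Purely as a sketch (not how the selection was originally done), the same 200 MeanDecreaseGini cutoff could be applied programmatically from the fitted forest; this assumes the `rf` and `sample` objects from the chunks above, and the names it selects should be checked against the hand-picked columns:
+
+```{r, eval=FALSE}
+#pull the importance matrix from the fitted forest and keep predictors above the cutoff
+imp <- importance(rf)
+keep <- rownames(imp)[imp[, "MeanDecreaseGini"] > 200]
+
+#rebuild the reduced data set by name rather than by hard-coded column index (hypothetical object name)
+simdat_by_name <- sample[, c(keep, "class")]
+
+#the forest's own OOB confusion matrix is also available for an honest error estimate
+rf$confusion
+```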
\ No newline at end of file
diff --git a/seq.pdf b/seq.pdf
new file mode 100644
index 0000000..f1c5396
Binary files /dev/null and b/seq.pdf differ
diff --git a/seq/.~lock.sample.csv# b/seq/.~lock.sample.csv#
new file mode 100644
index 0000000..cb84616
--- /dev/null
+++ b/seq/.~lock.sample.csv#
@@ -0,0 +1 @@
+,jeff,Rupert,24.05.2019 12:01,file:///home/jeff/.config/libreoffice/4;
\ No newline at end of file
diff --git a/seq/seq.py b/seq/seq.py
new file mode 100644
index 0000000..fa02122
--- /dev/null
+++ b/seq/seq.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 24 11:51:19 2019
+
+@author: jeff
+"""
+
+import pandas
+import matplotlib.pyplot as plt
+import numpy
+import math
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import LSTM
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+from sklearn.datasets.samples_generator import make_blobs
+
+
+#columns to be used decided from basic classification in R
+dataset = pandas.read_csv('sample.csv', usecols=[1,2,4,5,6,7,8,9,10,11,12,13,15,16,18,20], engine='python')
+dataset_norm = (dataset - dataset.mean()) / (dataset.max() - dataset.min())
+dataset_norm["class"] = dataset["class"]
+#plt.plot(dataset_norm)
+
+#set seed for reproducibility
+numpy.random.seed(212919156)
+
+#plt.figure()
+#groups = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+#i = 1
+#for group in groups:
+#    plt.subplot(len(groups), 1, i)
+#    plt.plot(dataset.values[:, group])
+#    plt.title(dataset.columns[group], y=0.5, loc='right')
+#    i += 1
+#plt.show
+
+def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
+    n_vars = 1 if type(data) is list else data.shape[1]
+    df = pandas.DataFrame(data)
+    cols, names = list(), list()
+    #input sequence
+    for i in range(n_in, 0, -1):
+        cols.append(df.shift(i))
+        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
+    #forecast sequence
+    for i in range(0, n_out):
+        cols.append(df.shift(-i))
+        if i == 0:
+            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
+        else:
+            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
+    #aggregate
+    agg = pandas.concat(cols, axis=1)
+    agg.columns = names
+    #drop NaNs
+    if dropnan:
+        agg.dropna(inplace=True)
+    return agg
+
+scaler = MinMaxScaler(feature_range=(0,1))
+scaled = scaler.fit_transform(dataset_norm.values.astype('float32'))
+scaled[:,1] = scaled[:,1].astype('int')
+reframed = series_to_supervised(scaled, 1, 1)
+reframed.drop(reframed.columns[[16,18,19,20,21,22,23,24,25,26,27,28,29,30,31]], axis=1, inplace=True)
+
+#split into train and test sets
+train = reframed.values[1:-(math.floor(len(dataset)*0.3)),:]
+test = reframed.values[-(math.floor(len(dataset)*0.3)):,:]
+#split into inputs and outputs
+train_X, train_y = train[:,:-1], train[:,-1]
+test_X, test_y = test[:,:-1], test[:,-1]
+#reshape for 3D input
+train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
+test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
+
+
+#Recurrent Neural Network for sequence prediction
+model = Sequential()
+model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
+model.add(Dense(1, activation='sigmoid'))
+model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+
+history = model.fit(train_X, train_y, epochs=500, batch_size=128, validation_data=(test_X, test_y), verbose=2, shuffle=False)
+
+plt.plot(history.history['loss'], label='train')
+plt.plot(history.history['val_loss'], label='test')
+plt.legend()
+plt.show()
+
+scores = model.evaluate(test_X, test_y, verbose=0)
+
+# make a prediction
+yhat = model.predict_classes(test_X)
+#ground truth for the test set is the target column (the class at time t)
+truth = test_y.astype('int')
+
+hits = 0
+misses = 0
+
+for i in range(len(truth)):
+    if truth[i] == yhat[i]:
+        hits = hits + 1
+    else:
+        misses = misses + 1
+
+accuracy = hits/len(truth)
+
+print('Measured Accuracy of Model: %.3f' % (accuracy*100))
+print('Evaluated Accuracy of Model: %.3f' % (scores[1]*100))
+
+
+#use dummy data (random blobs) to stand in for the next event, since real future events are not available
+Xnew, _ = make_blobs(n_samples=1, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 1-event")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))
+
+
+#use dummy data to predict the next few classes
+Xnew, _ = make_blobs(n_samples=5, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 5-events")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))
+
+Xnew, _ = make_blobs(n_samples=10, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 10-events")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))