diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c5c99fe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.RData
+
diff --git a/main.go b/main.go
index ac77289..7376df1 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,11 @@
 package main
 
 import (
+	"encoding/json"
 	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
 	"time"
 )
 
@@ -19,8 +23,8 @@ func timeTrack(start time.Time, name string) {
 //It also calls the compute and output the returned struct
 //to stdout.
 func main() {
-	defer timeTrack(time.Now(), "compute diff")
-	fmt.Println(computeDiff())
+	//defer timeTrack(time.Now(), "compute diff")
+	//fmt.Println(computeDiff())
 
 	defer timeTrack(time.Now(), "compute AST")
 	fmt.Println(computeAST())
@@ -36,6 +40,11 @@ func main() {
 // list of function calls seen in the diffs and their number of calls
 func computeDiff() *diffResult {
+	//s :=make([]string, 1)
+
+
+
+
 	return nil
 }
 
@@ -43,5 +52,138 @@ func computeDiff() *diffResult {
 //a astResult struct that contains all the variable declarations
 func computeAST() *astResult {
-	return nil
+	vars := make([]variableDescription, 0)
+
+	//path to ast json file
+	path := "./ast/astChallenge.json"
+	jsonFile, err := os.Open(path)
+
+	if err != nil {
+		fmt.Println(err)
+		return nil
+	}
+
+	fmt.Println("Successfully opened " + path)
+	defer jsonFile.Close()
+
+	byteValue, _ := ioutil.ReadAll(jsonFile)
+
+	var root Root
+	json.Unmarshal(byteValue, &root)
+
+	allNodes := decl(root.Root)
+
+	for i := 0; i < len(allNodes); i++ {
+		if allNodes[i].Type == "LocalDeclarationStatement" {
+			variable := getVar(allNodes[i])
+			vars = append(vars, variableDescription{variable[1], variable[0]})
+		}
+	}
+
+	r := astResult{vars}
+
+	return &r
+}
+
+//This function gets all nodes in the ast rooted at v
+func decl(v Type) []Type {
+	returnTypes := make([]Type, 0)
+
+	if len(v.Children) == 0 {
+		returnTypes = append(returnTypes, v)
+	} else {
+		returnTypes = append(returnTypes, v)
+		for i := 0; i < len(v.Children); i++ {
+			appends := decl(v.Children[i])
+			for j := 0; j < len(appends); j++ {
+				returnTypes = append(returnTypes, appends[j])
+			}
+		}
+	}
+
+	return returnTypes
+}
+
+//This function takes specific nodes from the ast and searches successive nodes for variables.
+//It works via a keyword search: it scans all nodes indexed after the input node until it finds one
+//whose "Type" field is "IdentifierToken" or contains "Keyword".
+//In our example, nodes with these types always carry some information about a declared variable;
+//in particular, this is the case when searching through the children of a "LocalDeclarationStatement",
+//which is the only time this function is called.
+//There is an argument that this amounts to hard-coding, but the only discernible pattern among
+//variables in the ast is the presence of these keywords, so this is the most obvious approach, as well as the quickest.
+//Because this is a keyword search, and our example does not contain user-defined variable types,
+//the function will not find user-defined types. That could be fixed once the keywords for those types are known.
+func getVar(v Type) [2]string {
+
+	var identifier string
+	var datatype string
+
+	nodes := decl(v)
+
+	for i := 0; i < len(nodes); i++ {
+		if nodes[i].Type == "VariableDeclarator" {
+			j := 1
+			for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
+				j++
+			}
+			if i+j < len(nodes) {
+				if strings.Contains(nodes[i+j].Type, "Keyword") {
+					datatype = nodes[i+j].Value
+				} else {
+					identifier = nodes[i+j].Value
+				}
+			}
+			continue
+		} else if nodes[i].Type == "VariableDeclaration" {
+			j := 1
+			for i+j < len(nodes) && nodes[i+j].Type != "IdentifierToken" && !strings.Contains(nodes[i+j].Type, "Keyword") {
+				j++
+			}
+			if i+j < len(nodes) {
+				if strings.Contains(nodes[i+j].Type, "Keyword") {
+					datatype = nodes[i+j].Value
+				} else {
+					identifier = nodes[i+j].Value
+				}
+
+				//This handles the case where a variable is declared as "var x = new type y"
+				//in our example, this only occurs when an array is declared, so we handle that possibility here
+				if nodes[i+j].Value == "var" {
+					for i+j < len(nodes) && nodes[i+j].Type != "CloseBracketToken" {
+						if strings.Contains(nodes[i+j].Type, "Keyword") {
+							datatype = nodes[i+j].Value
+						}
+						if nodes[i+j].Type == "OpenBracketToken" {
+							datatype = strings.Join([]string{datatype, "["}, "")
+						}
+						j++
+						if i+j < len(nodes) && nodes[i+j].Type == "CloseBracketToken" {
+							datatype = strings.Join([]string{datatype, "]"}, "")
+						}
+					}
+				}
+			}
+			continue
+		}
+	}
+
+	return [...]string{datatype, identifier}
+
+}
+
+type Root struct {
+	UUID string `json:"uuid"`
+	Root Type   `json:"Root"`
+}
+
+type Type struct {
+	Type     string `json:"Type"`
+	Value    string `json:"ValueText"`
+	Children []Type `json:"Children"`
 }
diff --git a/seq.R b/seq.R
new file mode 100644
index 0000000..0048f20
--- /dev/null
+++ b/seq.R
@@ -0,0 +1,36 @@
+library(glmnet)
+library(neuralnet)
+library(NeuralNetTools)
+library(nnet)
+library(randomForest)
+
+set.seed(212919156)
+
+#read in the sample data
+sample <- read.csv("seq/sample.csv")
+sample$class = factor(sample$class)
+
+#Use a random forest to generate a model to predict class
+rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))
+
+#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
+varImpPlot(rf)
+
+#based on the variable importance plot, the following variables fall below an arbitrary 200 MeanDecreaseGini threshold,
+#so we'll disregard them and rebuild the data without them
+simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
+
+#now using only those, we can run a logistic regression
+simlog <- glm(class ~ ., family="binomial", data=simdat)
+
+#Now get the confusion matrix for both
+#First the randomForest
+table(predict(rf, newdata=sample[,-1]), sample$class)
+
+#Then the logistic regression
+table(predict(simlog, type="response") > 0.5, sample$class)
+
+#obviously, the randomForest performs better for predictions, so given data about the next events, we would use the randomForest
+#model to predict the class. Random Forests have built-in cross-validation as well, due to Out-of-Bag estimation,
+#so there's no need to additionally cross-validate this model.
+
diff --git a/seq.Rmd b/seq.Rmd
new file mode 100644
index 0000000..8de19f7
--- /dev/null
+++ b/seq.Rmd
@@ -0,0 +1,61 @@
+---
+title: "Seq.Rmd"
+author: "Jeff B"
+date: "May 24, 2019"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Sequence Question
+
+This question is a sequence classification question.
+Considering the large number of variables, however, we can start by examining it as a straightforward classification problem. This gives us some of the benefits of straightforward classification, most importantly variable selection. This part was done in R.
+The sequence classification portion was done in Python, and can be found in the seq folder as seq.py.
+
+Start by importing the necessary libraries and setting the seed for reproducibility.
+```{r, echo=FALSE, warning=FALSE}
+library(glmnet)
+library(neuralnet)
+library(NeuralNetTools)
+library(nnet)
+library(randomForest)
+
+set.seed(212919156)
+```
+
+Next, we read in the data and generate a random forest model for it. Random forests have built-in cross-validation via out-of-bag (OOB) estimates, so we don't need to do any extra cross-validation. We can also look at the variable importance plot here to determine the most important variables. Somewhat arbitrarily, we'll set a cutoff at 200 MeanDecreaseGini. This leaves us with 15 relevant variables.
+```{r}
+sample <- read.csv("seq/sample.csv")
+sample$class = factor(sample$class)
+
+#Use a random forest to generate a model to predict class
+rf <- randomForest(factor(class)~., data=data.frame(sample[,-1]))
+
+#Now look at the Variable Importance Plot of the Random Forest to determine relevant variables to the model
+varImpPlot(rf)
+```
+
+For comparison, we'll run a logistic regression using only the variables above the threshold (a programmatic way to apply the same cutoff is sketched at the end of this section).
+
+```{r}
+simdat <- sample[,-c(1,4,24,17,21,23,14,19,22,25,26,27,28,29,30,31)]
+
+#now using only those, we can run a logistic regression
+simlog <- glm(class ~ ., family="binomial", data=simdat)
+```
+
+Now we'll get the confusion matrices for both models.
+```{r}
+#First the randomForest
+table(predict(rf, newdata=sample[,-1]), sample$class)
+
+#Then the logistic regression
+table(predict(simlog, type="response") > 0.5, sample$class)
+```
+
+The randomForest model performs better for predictions, so given data about the next event, we would use the randomForest model to predict the class.
+Considering we don't have that information, this process has still been useful, since we will use only those variables we've deemed relevant in our sequence prediction problem.
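+
+The reduced data set above was built from column indices read off the importance plot by hand. Purely as a sketch (not how the selection was originally done), the same 200 MeanDecreaseGini cutoff could be applied programmatically from the fitted forest; this assumes the `rf` and `sample` objects from the chunks above, and the names it selects should be checked against the hand-picked columns:
+
+```{r, eval=FALSE}
+#pull the importance matrix from the fitted forest and keep predictors above the cutoff
+imp <- importance(rf)
+keep <- rownames(imp)[imp[, "MeanDecreaseGini"] > 200]
+
+#rebuild the reduced data set by name rather than by hard-coded column index (hypothetical object name)
+simdat_by_name <- sample[, c(keep, "class")]
+
+#the forest's own OOB confusion matrix is also available for an honest error estimate
+rf$confusion
+```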
\ No newline at end of file
diff --git a/seq.pdf b/seq.pdf
new file mode 100644
index 0000000..f1c5396
Binary files /dev/null and b/seq.pdf differ
diff --git a/seq/.~lock.sample.csv# b/seq/.~lock.sample.csv#
new file mode 100644
index 0000000..cb84616
--- /dev/null
+++ b/seq/.~lock.sample.csv#
@@ -0,0 +1 @@
+,jeff,Rupert,24.05.2019 12:01,file:///home/jeff/.config/libreoffice/4;
\ No newline at end of file
diff --git a/seq/seq.py b/seq/seq.py
new file mode 100644
index 0000000..fa02122
--- /dev/null
+++ b/seq/seq.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri May 24 11:51:19 2019
+
+@author: jeff
+"""
+
+import pandas
+import matplotlib.pyplot as plt
+import numpy
+import math
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import LSTM
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+from sklearn.datasets.samples_generator import make_blobs
+
+
+#columns to be used decided from basic classification in R
+dataset = pandas.read_csv('sample.csv', usecols=[1,2,4,5,6,7,8,9,10,11,12,13,15,16,18,20], engine='python')
+dataset_norm = (dataset - dataset.mean()) / (dataset.max() - dataset.min())
+dataset_norm["class"] = dataset["class"]
+#plt.plot(dataset_norm)
+
+#set seed for reproducibility
+numpy.random.seed(212919156)
+
+#plt.figure()
+#groups = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+#i = 1
+#for group in groups:
+#    plt.subplot(len(groups), 1, i)
+#    plt.plot(dataset.values[:, group])
+#    plt.title(dataset.columns[group], y=0.5, loc='right')
+#    i += 1
+#plt.show
+
+def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
+    n_vars = 1 if type(data) is list else data.shape[1]
+    df = pandas.DataFrame(data)
+    cols, names = list(), list()
+    #input sequence
+    for i in range(n_in, 0, -1):
+        cols.append(df.shift(i))
+        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
+    #forecast sequence
+    for i in range(0, n_out):
+        cols.append(df.shift(-i))
+        if i == 0:
+            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
+        else:
+            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
+    #aggregate
+    agg = pandas.concat(cols, axis=1)
+    agg.columns = names
+    #drop NaNs
+    if dropnan:
+        agg.dropna(inplace=True)
+    return agg
+
+scaler = MinMaxScaler(feature_range=(0,1))
+scaled = scaler.fit_transform(dataset_norm.values.astype('float32'))
+scaled[:,1] = scaled[:,1].astype('int')
+reframed = series_to_supervised(scaled, 1, 1)
+reframed.drop(reframed.columns[[16,18,19,20,21,22,23,24,25,26,27,28,29,30,31]], axis=1, inplace=True)
+
+#split into train and test sets
+train = reframed.values[1:-(math.floor(len(dataset)*0.3)),:]
+test = reframed.values[-(math.floor(len(dataset)*0.3)):,:]
+#split into inputs and outputs
+train_X, train_y = train[:,:-1], train[:,-1]
+test_X, test_y = test[:,:-1], test[:,-1]
+#reshape for 3D input
+train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
+test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
+
+
+#Recurrent Neural Network for sequence prediction
+model = Sequential()
+model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
+model.add(Dense(1, activation='sigmoid'))
+model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+
+history = model.fit(train_X, train_y, epochs=500, batch_size=128, validation_data=(test_X, test_y), verbose=2, shuffle=False)
+
+plt.plot(history.history['loss'], label='train')
+plt.plot(history.history['val_loss'], label='test')
+plt.legend()
+plt.show()
+
+scores = model.evaluate(test_X, test_y, verbose=0)
+
+# make a prediction
+yhat = model.predict_classes(test_X)
+#ground truth for the test set is the target column (the class at time t)
+truth = test_y.astype('int')
+
+hits = 0
+misses = 0
+
+for i in range(len(truth)):
+    if truth[i] == yhat[i]:
+        hits = hits + 1
+    else:
+        misses = misses + 1
+
+accuracy = hits/len(truth)
+
+print('Measured Accuracy of Model: %.3f' % (accuracy*100))
+print('Evaluated Accuracy of Model: %.3f' % (scores[1]*100))
+
+
+#use dummy data (random blobs) to stand in for the next event, since real future events are not available
+Xnew, _ = make_blobs(n_samples=1, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 1-event")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))
+
+
+#use dummy data to predict the next few classes
+Xnew, _ = make_blobs(n_samples=5, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 5-events")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))
+
+Xnew, _ = make_blobs(n_samples=10, centers=3, n_features=train_X.shape[2], random_state=1)
+Xnew = Xnew.reshape((Xnew.shape[0], 1, Xnew.shape[1]))
+ynew = model.predict_classes(Xnew)
+
+print("Predicting the Class of the next 10-events")
+for i in range(len(Xnew)):
+    print("Event %s, Predicted=%s" % (i, ynew[i]))