Graduate Admission Data Analysis

The data was collected from Kaggle.

The dataset contains several parameters that are considered important during the application for Masters programs. The parameters included are:

1. GRE Score (out of 340) – Continuous
2. TOEFL Score (out of 120) – Continuous
3. University Rating (out of 5) – Categorical
4. Statement of Purpose (SOP) strength (out of 5) – Continuous
5. Letter of Recommendation (LOR) strength (out of 5) – Continuous
6. Undergraduate GPA (CGPA, out of 10) – Continuous
7. Research Experience (either 0 or 1) – Categorical
8. Chance of Admit (ranging from 0 to 1) – Continuous

We will start with an exploratory data analysis to understand the data better. It will help us clean the data and make informed decisions about which model to use.

setwd("C:/Users/Srijit Mukherjee/Desktop")
data = read.csv("Data.csv")
data = data[,-1]   # drop the serial number column
head(data)
str(data)
colnames(data)
Data = data
## set the seed to make the partition reproducible
set.seed(123)
## 75% of the sample size
smp_size <- floor(0.75 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
test <- data[-train_ind, ]   # 25% hold-out set
data <- data[train_ind, ]    # keep the training rows in `data`
y = as.matrix(data$Chance.of.Admit)
x = as.matrix(data[c(1:2,4:6)])
catdata = data[c(3,7)]
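# A quick optional check of the partition (a sketch): the training set should hold
# roughly 75% of the rows and the hold-out set the remaining 25%.
nrow(data)   # training rows
nrow(test)   # hold-out rows
dim(x)       # continuous predictors: GRE.Score, TOEFL.Score, SOP, LOR, CGPA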
# Question: given a student's scores, which university rating should they apply to in order to maximize their chance of admission?
par(mfrow=c(2,2))
hist(data$GRE.Score)
hist(data$TOEFL.Score)
hist(data$University.Rating)
hist(data$SOP)
par(mfrow=c(2,2))
hist(data$LOR)
hist(data$CGPA)
hist(data$Research)
hist(data$Chance.of.Admit)
# The predictors look roughly normally distributed to start with, with no obvious anomalies.
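# Optional numerical check (a sketch): describe() from the psych package (also used
# below for tr()) reports skewness and kurtosis to back up the visual impression.
library(psych)
describe(data)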
library(corrplot)
par(mfrow=c(1,2))
corrplot(cor(data), type="upper", order="hclust", col=c("black", "white"),
         bg="lightblue")
corrplot(cor(data), method="number")
# The correlation plots show strong correlation among the predictors; this multicollinearity inflates coefficient variance and can lead to overfitting.
# Hence we fit ridge regression, choosing the penalty by cross-validation.
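# Optional check (a sketch, assuming the car package is installed): variance
# inflation factors from an ordinary least squares fit quantify the multicollinearity.
library(car)
ols <- lm(Chance.of.Admit ~ ., data = data)
vif(ols)   # values well above 1 indicate inflated coefficient variances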
library(glmnet)
library(dplyr)
library(psych)
lambdas_to_try <- 10^seq(-3, 5, length.out = 100)
ridge_cv <- cv.glmnet(x, y, alpha = 0, lambda = lambdas_to_try,standardize = TRUE, nfolds = 10)
plot(ridge_cv)
lambda_cv <- ridge_cv$lambda.min     
model_cv <- glmnet(x, y, alpha = 0, lambda = lambda_cv, standardize = TRUE)
y_hat_cv <- predict(model_cv, x)
ssr_cv <- t(y - y_hat_cv) %*% (y - y_hat_cv)
rsq_ridge_cv <- cor(y, y_hat_cv)^2                       
rsq_ridge_cv
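# The hold-out set created earlier is not used above; as a sketch, we can also check
# the ridge fit on it (x_test and y_test are new helper objects built like x and y).
x_test <- as.matrix(test[c(1:2, 4:6)])
y_test <- as.matrix(test$Chance.of.Admit)
y_hat_test <- predict(model_cv, x_test)
rsq_ridge_test <- cor(y_test, y_hat_test)^2
rsq_ridge_test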
# Alternatively, we can choose the ridge penalty using AIC and BIC.
lambdas_to_try <- 10^seq(-3, 5, length.out = 100)
x_scaled <- scale(x)
aic <- c()
bic <- c()
for (lambda in seq(lambdas_to_try)) {
  # Run model
  model <- glmnet(x, y, alpha = 0, lambda = lambdas_to_try[lambda], standardize = TRUE)
  # Compute residuals from the fitted model
  resid <- y - predict(model, x)
  # Compute hat-matrix and degrees of freedom
  ld <- lambdas_to_try[lambda] * diag(ncol(x_scaled))
  H <- x_scaled %*% solve(t(x_scaled) %*% x_scaled + ld) %*% t(x_scaled)
  df <- tr(H)
  # Compute information criteria
  aic[lambda] <- nrow(x_scaled) * log(t(resid) %*% resid) + 2 * df
  bic[lambda] <- nrow(x_scaled) * log(t(resid) %*% resid) + df * log(nrow(x_scaled))
}
plot(log(lambdas_to_try), aic, col = "orange", type = "l",
     ylim = range(c(aic, bic)), ylab = "Information Criterion")
lines(log(lambdas_to_try), bic, col = "skyblue3")
legend("bottomright", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
# Optimal lambdas according to both criteria
lambda_aic <- lambdas_to_try[which.min(aic)]
lambda_bic <- lambdas_to_try[which.min(bic)]
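# For reference, the criteria above use the effective degrees of freedom
# df(lambda) = tr( X (X'X + lambda*I)^{-1} X' ), with AIC = n*log(SSR) + 2*df and
# BIC = n*log(SSR) + df*log(n). Since glmnet scales its penalty internally, this is
# only an approximation. A quick sketch comparing the selected penalties:
c(lambda_cv = lambda_cv, lambda_aic = lambda_aic, lambda_bic = lambda_bic)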

# Fit final models, get their sum of squared residuals and multiple R-squared
model_aic <- glmnet(x, y, alpha = 0, lambda = lambda_aic, standardize = TRUE)
y_hat_aic <- predict(model_aic, x)
ssr_aic <- t(y - y_hat_aic) %*% (y - y_hat_aic)
rsq_ridge_aic <- cor(y, y_hat_aic)^2

model_bic <- glmnet(x, y, alpha = 0, lambda = lambda_bic, standardize = TRUE)
y_hat_bic <- predict(model_bic, x)
ssr_bic <- t(y - y_hat_bic) %*% (y - y_hat_bic)
rsq_ridge_bic <- cor(y, y_hat_bic)^2


# See how increasing lambda shrinks the coefficients --------------------------
# Each line shows the coefficient path of one variable across the lambda grid.
# The higher the lambda, the more the coefficients are shrunk towards zero.
res <- glmnet(x, y, alpha = 0, lambda = lambdas_to_try, standardize = FALSE)
plot(res, xvar = "lambda")
legend("bottomright", lwd = 1, col = 1:6, legend = colnames(x), cex = .7)

# AIC, BIC and cross-validation all give very similar R-squared values for ridge.

#lasso with cv
lasso_cv <- cv.glmnet(x, y, alpha = 1, lambda = lambdas_to_try,standardize = TRUE, nfolds = 10)
plot(lasso_cv)
lambda_cv <- lasso_cv$lambda.min     
model_cv <- glmnet(x, y, alpha = 1, lambda = lambda_cv, standardize = TRUE)
y_hat_cv <- predict(model_cv, x)
ssr_cv <- t(y - y_hat_cv) %*% (y - y_hat_cv)
rsq_lasso_cv <- cor(y, y_hat_cv)^2                       
rsq_lasso_cv
coef(lasso_cv)
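# coef() on a cv.glmnet object uses s = "lambda.1se" by default; as a sketch, the
# coefficients at the CV-optimal penalty can be inspected instead:
coef(lasso_cv, s = "lambda.min")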

# ANCOVA: refit with University.Rating and Research treated as factors
Data = data
Data$University.Rating = as.factor(Data$University.Rating)
Data$Research = as.factor(Data$Research)
model = aov(Chance.of.Admit~., Data)
summary(model)
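# aov() reports F-tests only; as a sketch, the equivalent linear model (lmodel is a
# new helper object) shows the direction and size of each effect, with the factors
# entering as dummy variables.
lmodel <- lm(Chance.of.Admit ~ ., data = Data)
summary(lmodel)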

#lasso with aic bic
x_scaled <- scale(x)
aic <- c()
bic <- c()
for (lambda in seq(lambdas_to_try)) {
  # Run model
  model <- glmnet(x, y, alpha = 1, lambda = lambdas_to_try[lambda], standardize = TRUE)
  # Compute residuals from the fitted model
  resid <- y - predict(model, x)
  # Compute hat-matrix and degrees of freedom
  # (this reuses the ridge effective-df formula as a rough approximation; for the
  #  lasso, df is usually estimated by the number of nonzero coefficients)
  ld <- lambdas_to_try[lambda] * diag(ncol(x_scaled))
  H <- x_scaled %*% solve(t(x_scaled) %*% x_scaled + ld) %*% t(x_scaled)
  df <- tr(H)
  # Compute information criteria
  aic[lambda] <- nrow(x_scaled) * log(t(resid) %*% resid) + 2 * df
  bic[lambda] <- nrow(x_scaled) * log(t(resid) %*% resid) + df * log(nrow(x_scaled))
}
plot(log(lambdas_to_try), aic, col = "orange", type = "l",
     ylim = range(c(aic, bic)), ylab = "Information Criterion")
lines(log(lambdas_to_try), bic, col = "skyblue3")
legend("bottomright", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
# Optimal lambdas according to both criteria
lambda_aic <- lambdas_to_try[which.min(aic)]
lambda_bic <- lambdas_to_try[which.min(bic)]

# Fit final models, get their sum of squared residuals and multiple R-squared
model_aic <- glmnet(x, y, alpha = 1, lambda = lambda_aic, standardize = TRUE)
y_hat_aic <- predict(model_aic, x)
ssr_aic <- t(y - y_hat_aic) %*% (y - y_hat_aic)
rsq_lasso_aic <- cor(y, y_hat_aic)^2

model_bic <- glmnet(x, y, alpha = 1, lambda = lambda_bic, standardize = TRUE)
y_hat_bic <- predict(model_bic, x)
ssr_bic <- t(y - y_hat_bic) %*% (y - y_hat_bic)
rsq_lasso_bic <- cor(y, y_hat_bic)^2


# See how increasing lambda shrinks the coefficients --------------------------
# Each line shows the coefficient path of one variable across the lambda grid.
# The higher the lambda, the more the coefficients are shrunk towards zero.
res <- glmnet(x, y, alpha = 1, lambda = lambdas_to_try, standardize = FALSE)
plot(res, xvar = "lambda")
legend("bottomright", lwd = 1, col = 1:6, legend = colnames(x), cex = .7)

# forward and backward stepwise selection using aov
base = aov(Chance.of.Admit ~ 1, data)
top = aov(Chance.of.Admit ~ ., data)
forward_step = step(base, scope = list(upper = formula(top), lower = ~1), direction = "forward", trace = FALSE)
forward_step

backward_step=step(top,direction = 'backward', trace = FALSE)
backward_step
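# A quick sketch of what each procedure keeps: both selected formulas drop SOP.
formula(forward_step)
formula(backward_step)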

finalmodel = aov(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating +
                   LOR + CGPA + Research, data = data)
# Both selection procedures drop SOP, i.e. SOP is found to be insignificant.

y_hat_aov <- predict(finalmodel, data[,-8])
ssr_aov <- t(data[,8] - y_hat_aov) %*% (data[,8] - y_hat_aov)
rsq_aov <- cor(data[,8], y_hat_aov)^2

# All approaches arrive at essentially the same set of predictors; in this case the aov (linear) model gives the best in-sample fit.
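# A sketch collecting the in-sample R-squared values computed above into one table
# (object names as defined earlier in this script):
data.frame(model = c("ridge (CV)", "ridge (AIC)", "ridge (BIC)",
                     "lasso (CV)", "lasso (AIC)", "lasso (BIC)", "aov"),
           rsq = c(rsq_ridge_cv, rsq_ridge_aic, rsq_ridge_bic,
                   rsq_lasso_cv, rsq_lasso_aic, rsq_lasso_bic, rsq_aov))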

I will update this post with the remaining details and conclusions soon.
