library(tidyverse)
library(GGally)
library(patchwork)
library(corrplot)
library(gridExtra)
library(forcats)
library(glmnet)
library(rsample)
library(mltools)

# Silence every R warning from this point on.
# NOTE(review): this also hides deprecation and model-fitting warnings raised
# by later cells; consider removing it or suppressing specific calls instead.
options(warn=-1)  # -1 suppresses all warnings

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

corrplot 0.92 loaded


Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine


Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Loaded glmnet 4.1-8


Attaching package: ‘mltools’


The following object is masked from ‘package:tidyr’:

    replace_na

# Read the semicolon-delimited student-mat.csv file straight from GitHub.
url <- "https://raw.githubusercontent.com/jadeeechen/STAT-301-Project-2023W1-/main/student-mat.csv"
data <- read.csv(file = url, sep = ";")
head(data, n = 3)

# Drop the G1 and G2 columns; G3 is the only grade kept.
data <- select(data, -c(G1, G2))

# Column groups reused by the conversions below and by the exploratory cells.
numeric_vars <- c(
  "age", "Medu", "Fedu", "traveltime", "studytime", "failures",
  "famrel", "freetime", "goout", "Dalc", "Walc", "health",
  "absences", "G3"
)

categorical_vars <- c(
  "school", "sex", "address", "famsize", "Pstatus",
  "Mjob", "Fjob", "reason", "guardian", "schoolsup",
  "famsup", "paid", "activities", "nursery", "higher",
  "internet", "romantic"
)

# Binary and nominal columns -> factor type
data[categorical_vars] <- lapply(data[categorical_vars], as.factor)

# Numeric columns -> numeric (double) type
data[numeric_vars] <- lapply(data[numeric_vars], as.numeric)

# Min / quartiles / mean / max for every numeric column
summary_stats <- data |>
  select(all_of(numeric_vars)) |>
  summary()
print(summary_stats)

      age            Medu            Fedu         traveltime      studytime    
 Min.   :15.0   Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:16.0   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
 Median :17.0   Median :3.000   Median :2.000   Median :1.000   Median :2.000  
 Mean   :16.7   Mean   :2.749   Mean   :2.522   Mean   :1.448   Mean   :2.035  
 3rd Qu.:18.0   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:2.000  
 Max.   :22.0   Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
    failures          famrel         freetime         goout      
 Min.   :0.0000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.0000   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:2.000  
 Median :0.0000   Median :4.000   Median :3.000   Median :3.000  
 Mean   :0.3342   Mean   :3.944   Mean   :3.235   Mean   :3.109  
 3rd Qu.:0.0000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :3.0000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
      Dalc            Walc           health         absences     
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 0.000  
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 0.000  
 Median :1.000   Median :2.000   Median :4.000   Median : 4.000  
 Mean   :1.481   Mean   :2.291   Mean   :3.554   Mean   : 5.709  
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.: 8.000  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :75.000  
       G3       
 Min.   : 0.00  
 1st Qu.: 8.00  
 Median :11.00  
 Mean   :10.42  
 3rd Qu.:14.00  
 Max.   :20.00

# Print a labelled frequency table for each categorical column.
walk(categorical_vars, \(column) {
  print(paste("Frequency table for", column))
  print(table(data[[column]]))
})

[1] "Frequency table for school"

 GP  MS 
349  46 
[1] "Frequency table for sex"

  F   M 
208 187 
[1] "Frequency table for address"

  R   U 
 88 307 
[1] "Frequency table for famsize"

GT3 LE3 
281 114 
[1] "Frequency table for Pstatus"

  A   T 
 41 354 
[1] "Frequency table for Mjob"

 at_home   health    other services  teacher 
      59       34      141      103       58 
[1] "Frequency table for Fjob"

 at_home   health    other services  teacher 
      20       18      217      111       29 
[1] "Frequency table for reason"

    course       home      other reputation 
       145        109         36        105 
[1] "Frequency table for guardian"

father mother  other 
    90    273     32 
[1] "Frequency table for schoolsup"

 no yes 
344  51 
[1] "Frequency table for famsup"

 no yes 
153 242 
[1] "Frequency table for paid"

 no yes 
214 181 
[1] "Frequency table for activities"

 no yes 
194 201 
[1] "Frequency table for nursery"

 no yes 
 81 314 
[1] "Frequency table for higher"

 no yes 
 20 375 
[1] "Frequency table for internet"

 no yes 
 66 329 
[1] "Frequency table for romantic"

 no yes 
263 132

options(repr.plot.width = 6, repr.plot.height = 6)

# Boxplot of G3 for each level of `internet`.
box_internet <- data |>
  ggplot(aes(x = internet, y = G3, fill = internet)) +
  geom_boxplot() +
  labs(
    title = "Internet Access vs. Final Grade",
    x = "Internet Access at Home",
    y = "Final Grade (G3)"
  )
box_internet

options(repr.plot.width = 8, repr.plot.height = 8)

############## FACTOR ##############

# Keep only the factor columns; internet is left out because it was already
# explored on its own.
data_factor <- data |>
  select(where(is.factor)) |>
  select(-internet)

# One panel per remaining factor column
facet_vars_factor <- names(data_factor)

# For each factor column, draw a bar chart of the mean G3 per level.
facetbar_factor <- lapply(facet_vars_factor, \(var) {
  level_means <- data |>
    group_by(.data[[var]]) |>
    summarize(avg_G3 = mean(G3, na.rm = TRUE)) |>
    arrange(desc(avg_G3))

  # Re-order the factor levels by avg_G3 so the bars appear in ascending order
  level_means[[var]] <- reorder(level_means[[var]], level_means$avg_G3)

  ggplot(level_means, aes(x = .data[[var]], y = avg_G3)) +
    geom_col(fill = "blue") +
    labs(title = var, x = var, y = "Final Grade (G3)")
})

# Lay the panels out on a three-column grid
plot_facetbar_factor <- wrap_plots(facetbar_factor, ncol = 3)
plot_facetbar_factor

############## NUMERIC ##############

options(repr.plot.width = 8, repr.plot.height = 8)

data_numeric <- data |>
  select(where(is.numeric))

# One panel per numeric column.
# NOTE(review): this includes G3 itself, whose panel is just mean(G3) grouped
# by G3 (an identity plot) — confirm whether that panel is wanted.
facet_vars_numeric <- names(data_numeric)

# For each numeric column, draw a bar chart of the mean G3 per distinct value.
facetbar_numeric <- lapply(facet_vars_numeric, \(var) {
  value_means <- summarize(
    group_by(data, .data[[var]]),
    avg_G3 = mean(G3, na.rm = TRUE)
  )

  ggplot(value_means, aes(x = .data[[var]], y = avg_G3)) +
    geom_col(fill = "blue") +
    labs(title = var, x = var, y = "Final Grade (G3)")
})

# Lay the panels out on a three-column grid
plot_facetbar_numeric <- wrap_plots(facetbar_numeric, ncol = 3)
plot_facetbar_numeric

# Pairwise correlations between all numeric columns
cor_matrix <- data_numeric |> cor()

options(repr.plot.width = 8, repr.plot.height = 8)

# Upper-triangle circle plot, variables clustered hierarchically, with the
# coefficient printed in each cell.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  addCoef.col = "dark grey"
)

options(repr.plot.width = 12, repr.plot.height = 8)

# G3 against absences, coloured by internet access, with one least-squares
# line per internet group. The formula is spelled out so geom_smooth() does
# not emit its "using formula = 'y ~ x'" message.
scatter_absences_combined <- ggplot(data, aes(x = absences, y = G3, color = internet)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    x = "Absences", y = "Final Grade (G3)",
    title = "Absences vs Final Grade"
  )

# Same layout for the number of past class failures.
scatter_failures_combined <- ggplot(data, aes(x = failures, y = G3, color = internet)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(
    x = "Number of Past Class Failures", y = "Final Grade (G3)",
    title = "Past Class Failures vs Final Grade"
  )

# grid.arrange() draws the side-by-side figure itself and returns the gtable.
# The result is kept but not auto-printed: printing a gtable only emits its
# "TableGrob" layout description, not the plot.
combined_plot <- grid.arrange(scatter_absences_combined, scatter_failures_combined, ncol = 2)

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

TableGrob (1 x 2) "arrange": 2 grobs
  z     cells    name           grob
1 1 (1-1,1-1) arrange gtable[layout]
2 2 (1-1,2-2) arrange gtable[layout]

# Pieces shared by all four boxplots below: a yellow point at each group's
# mean, and a smaller title font.
mean_marker <- stat_summary(fun = mean, geom = "point", color = "yellow", fill = "yellow")
small_title <- theme(plot.title = element_text(size = 9))

# Math grades by school ('GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
mat_boxplot_school <- ggplot(data, aes(x = school, y = G3, fill = school)) +
  geom_boxplot() +
  mean_marker +
  labs(
    title = "Impact of School on Final Grade",
    x = "Schools",
    y = "Math Final Grade (G3)"
  ) +
  small_title

# Math grades by extra educational support
mat_boxplot_schoolsup <- ggplot(data, aes(x = schoolsup, y = G3, fill = schoolsup)) +
  geom_boxplot() +
  mean_marker +
  labs(
    title = "Impact of Extra Educational Support on Final Grade",
    x = "Extra Educational Support",
    y = "Math Final Grade (G3)"
  ) +
  small_title

# Math grades by aspiration to take higher education
mat_boxplot_higher <- ggplot(data, aes(x = higher, y = G3, fill = higher)) +
  geom_boxplot() +
  mean_marker +
  labs(
    title = "Impact of Aspiration For Higher Education on Final Grade",
    x = "Aspiration For Higher Education",
    y = "Math Final Grade (G3)"
  ) +
  small_title

# Math grades by Internet access at home
mat_boxplot_internet <- ggplot(data, aes(x = internet, y = G3, fill = internet)) +
  geom_boxplot() +
  mean_marker +
  labs(
    title = "Impact of Internet Access At Home on Final Grade",
    x = "Internet Access At Home",
    y = "Math Final Grade (G3)"
  ) +
  small_title

# Draw the four boxplots on a 2 x 2 grid
combine_plot1 <- grid.arrange(
  mat_boxplot_school, mat_boxplot_schoolsup,
  mat_boxplot_higher, mat_boxplot_internet,
  ncol = 2
)

explanatory_vars <- c("studytime", "traveltime", "goout", "Dalc", "Walc", "absences")
response_var <- "G3"

data_for_model <- data |>
  select(all_of(c(explanatory_vars, response_var)))

# Design matrix for the LASSO fit; the leading intercept column is dropped
x <- model.matrix(G3 ~ ., data = data_for_model)[, -1]
y <- data$G3

# Centre and scale every explanatory column
x_scaled <- scale(x)


set.seed(123)

# Cross-validated LASSO path (alpha = 1 selects the LASSO penalty)
cv_lasso <- cv.glmnet(x = x_scaled, y = y, alpha = 1)

# Lambda with the smallest cross-validated error
best_lambda <- cv_lasso$lambda.min

# Refit at that single lambda and pull out coefficients and fitted values
lasso_model <- glmnet(x = x_scaled, y = y, alpha = 1, lambda = best_lambda)
lasso_coef <- coef(lasso_model, s = best_lambda)
predicted_values <- predict(lasso_model, newx = x_scaled, s = best_lambda)

# R-squared on the same rows the model was fitted on
residual_ss <- sum((y - predicted_values)^2)
total_ss <- sum((y - mean(y))^2)
r_squared <- 1 - residual_ss / total_ss
print(r_squared)

# Coefficients as a data frame for plotting; the first row (the intercept)
# is removed
coef_df <- data.frame(
  Variable = rownames(lasso_coef),
  Coefficient = lasso_coef[, 1]
)[-1, ]

# Horizontal bar chart of the remaining coefficients
ggplot(coef_df, aes(x = Variable, y = Coefficient)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "LASSO Regression Coefficients",
    x = "Predictor Variables",
    y = "Coefficients"
  )

[1] 0.03570681

# Mjob and Fjob: merge at_home into other and health/teacher into services.
data$Mjob <- recode_factor(data$Mjob, at_home = "other", health = "services", teacher = "services")
# Fjob is recoded from Fjob itself. Recoding from Mjob here would overwrite
# Fjob with an exact copy of the already-recoded Mjob column.
data$Fjob <- recode_factor(data$Fjob, at_home = "other", health = "services", teacher = "services")

# Reason: merge course and reputation into "school attribute", and home into other.
data$reason <- recode_factor(data$reason, course = "school attribute", reputation = "school attribute", home = "other")

# Guardian: merge father and mother into parents.
data$guardian <- recode_factor(data$guardian, father = "parents", mother = "parents")

binary <- c("schoolsup", "famsup", "paid", "activities", "nursery", "higher", "internet", "romantic")

# Encode the yes/no columns as 1 for "yes" and 0 for any other value.
data[binary] <- lapply(data[binary], \(col) as.numeric(col == "yes"))

factor_2_level <- c("school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob", "reason", "guardian")

# Encode each two-level factor as numeric: 2 for its first level, 1 for its
# second level.
for (column_name in factor_2_level) {
  column_levels <- levels(data[[column_name]])
  # Fail loudly instead of silently collapsing extra levels into one code, or
  # producing NA when the column is no longer a factor (e.g. on a re-run).
  if (length(column_levels) != 2) {
    stop(
      "Column '", column_name, "' must be a factor with exactly two levels",
      call. = FALSE
    )
  }
  data[[column_name]] <- as.numeric(data[[column_name]] == column_levels[1]) + 1
}

set.seed(123)

# Split stratified on G3: 60% of rows for variable selection, the rest for
# prediction.
data_split <- initial_split(data, prop = 0.6, strata = G3)
data_selection <- training(data_split)
data_prediction <- testing(data_split)

# Position of the response column
response_col_index <- match("G3", names(data))

# Predictor matrices (all columns except G3) and one-column response matrices
X_train <- data_selection[, -response_col_index] |> as.matrix()
Y_train <- data_selection[[response_col_index]] |> as.matrix()
X_test <- data_prediction[, -response_col_index] |> as.matrix()
Y_test <- data_prediction[[response_col_index]] |> as.matrix()

# Cross-validated LASSO (alpha = 1) on the selection split. X_train and
# Y_train are already matrices, so they are passed as-is.
lasso_model <- cv.glmnet(x = X_train, y = Y_train, alpha = 1)

# Coefficients at the lambda with the smallest cross-validated error
beta_lasso <- coef(lasso_model, s = "lambda.min")

# Take the single coefficient column by position rather than by its
# auto-generated name, so the selection does not depend on how glmnet labels it.
beta_values <- as.matrix(beta_lasso)[, 1]

# Names of the covariates with a non-zero coefficient, intercept excluded
lasso_selected_covariates <- names(beta_values)[
  names(beta_values) != "(Intercept)" & beta_values != 0
]

# Ordinary least squares on the LASSO-selected covariates only.
# all_of() marks lasso_selected_covariates as an external character vector;
# passing it bare to select() is deprecated and ambiguous if a column ever
# shares that name.
model <- lm(
  G3 ~ .,
  data = data_selection |> select(all_of(lasso_selected_covariates), G3)
)
model

Call:
lm(formula = G3 ~ ., data = select(data_selection, lasso_selected_covariates, 
    G3))

Coefficients:
(Intercept)          sex      address         Medu         Mjob     failures  
    11.6409      -0.6660      -0.9169       0.4513      -0.5298      -1.9090  
   internet     romantic       famrel       health  
     0.7916      -1.1859       0.5280      -0.3936

# Baseline: least squares on every column of the selection split
model_full <- lm(G3 ~ ., data = data_selection)
prediction_full <- predict(model_full, newdata = data_prediction)

# Predictions of the LASSO-selected model on the same prediction split
prediction <- predict(model, newdata = data_prediction)

# RMSE of both models on the prediction split, one row per model
R_MSE_models <- tibble(
  Model = c("Full Model", "LASSO Model"),
  R_MSE = c(
    rmse(preds = prediction_full, actuals = data_prediction$G3),
    rmse(preds = prediction, actuals = data_prediction$G3)
  )
)

R_MSE_models

Student Performance

Introduction

Research Question

Methods and Results

Importing Packages

Read Data

Drop Features

Data Wrangling

Feature Details

Exploratory Data Analysis

Summary Statistics for Numerical Data Analysis

Frequency Table for Categorical Data Analysis

Box Plot

Faceted Bar Chart

Correlation Plot

Visualizations

Visualization 1

Visualization 2

Visualization 3

Method: Lasso Regularization

Reason

Assumptions

Implementation

Feature Selection

Potential Limitations

Discussion

References

	school	sex	age	address	famsize	Pstatus	Medu	Fedu	Mjob	Fjob	⋯	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2	G3
	<chr>	<chr>	<int>	<chr>	<chr>	<chr>	<int>	<int>	<chr>	<chr>	⋯	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
1	GP	F	18	U	GT3	A	4	4	at_home	teacher	⋯	4	3	4	1	1	3	6	5	6	6
2	GP	F	17	U	GT3	T	1	1	at_home	other	⋯	5	3	3	1	1	3	4	5	5	6
3	GP	F	15	U	LE3	T	1	1	at_home	other	⋯	4	3	2	2	3	3	10	7	8	10