#Predicting College GPA.R
#Bringing in the data
dat <- read.csv("GPA2.csv", header = TRUE)
#Checking out the data
head(dat)
dim(dat)
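#Optional sketch: a quick structural check of the data before plotting.
#(Assumes the GPA2.csv columns should all load as numeric; adjust if any arrive as factors.)
str(dat)
colSums(is.na(dat))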
#Importing ggplot2 package for visualizations
library(ggplot2)
#Distribution, shape, summary and standard deviation of college GPA column
#(Our y variable)
hist(dat$colgpa, main = "College GPA", col = "blue", xlim = c(1, 4), breaks = 15, xlab = "GPA")
boxplot(dat$colgpa, main = "College GPA", col = "red", ylab = "GPA")
summary(dat$colgpa)
sd(dat$colgpa)
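#Optional sketch: a normal Q-Q plot to judge the shape of the GPA distribution
#seen in the histogram above (base R only; uses the same colgpa column).
qqnorm(dat$colgpa, main = "Normal Q-Q Plot of College GPA")
qqline(dat$colgpa)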
#Correlation matrix of the other variables to see if any
#relationships exist
Matrix <- dat[,c(1,2,4,5,6)]
cor(Matrix)
#Round results to 2 decimal places
round(cor(Matrix),2)
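#Optional sketch: the same pairwise relationships shown visually with a
#base-R scatterplot matrix (no extra packages assumed).
pairs(Matrix, main = "Pairwise Scatterplots of Predictors")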
#Regression to see which variables are important in the model
reg1 <- lm(colgpa ~ ., data = dat)
summary(reg1)
reg2 <- lm(colgpa ~ athlete + female, data = dat)
summary(reg2)
#According to these results,
#a student's SAT score, athlete status, rank in the high school graduating class,
#high school percentile position and gender are significant in this regression model.
#All of these variables have small p-values (< 0.05), which indicates they are important to the model.
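#To back up the comment above with code (a sketch): pull the p-values directly
#from the coefficient table of summary(reg1) (column 4 holds Pr(>|t|)), and
#compare the reduced model reg2 against the full model reg1 with a partial F-test.
round(summary(reg1)$coefficients[, 4], 4)
anova(reg2, reg1)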
#Analyzing the interaction terms
#College GPA against high school percentile, broken down by athlete status and by gender
ggplot(dat, aes(x = hsperc, y = colgpa, color = as.factor(athlete))) +
  geom_point() + geom_smooth(method = "lm") + scale_color_manual(values = c("1" = "red", "0" = "blue"))
ggplot(dat, aes(x = hsperc, y = colgpa, color = as.factor(female))) +
geom_point() + geom_smooth(method = "lm")
#Looking at these two scatterplots, it appears that only the interaction term hsperc*athlete is worth exploring and placing in the model.
#This is because the fitted lines in that plot have clearly different slopes and intersect,
#whereas the lines in the gender plot are roughly parallel.
#(A formal partial F-test of this interaction appears after Model_1 below.)
#New regression model to predict college GPA (y) using the important predictor variables
#SAT score, athlete status, high school class rank, high school percentile position, gender, and the hsperc*athlete interaction term
Model_1 <- lm(colgpa ~ SAT + athlete + hsrank + hsperc + female + hsperc*athlete, data = dat)
summary(Model_1)
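#A more formal check of the interaction decision above (a sketch): refit the
#model without the hsperc:athlete term and compare it to Model_1 with a
#partial F-test. (Model_no_int is a helper object introduced just for this comparison.)
Model_no_int <- lm(colgpa ~ SAT + athlete + hsrank + hsperc + female, data = dat)
anova(Model_no_int, Model_1)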
#Round results to 3 decimal places
round(coef(Model_1), 3)
#Least Squares Line:
#ColGPA = 1.156 + 0.002SAT - 0.02athlete - 0.001hsrank - 0.011hsperc + 0.152female + 0.006athlete*hsperc
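#Optional sketch: 95% confidence intervals for these coefficients, to gauge
#how precisely each term in the least-squares line above is estimated.
round(confint(Model_1), 3)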
#Trying out a prediction
#Let's say a male student received a 1390 on his SAT
#Let's say he was ranked 8th in his high school class
#Let's say he's not an athlete
#And let's say this rank puts him in the 6th percentile of his class.
newdat <- data.frame(SAT = 1390, female = 0, hsrank = 8, hsperc = 6, athlete = 0)
round(predict(Model_1, newdat, interval = "prediction"), 3)
#According to the model, this male student is predicted to have
#a college GPA of about 3.358.
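#Cross-check (a sketch): because athlete = 0 and female = 0 for this student,
#those terms and the interaction drop out, so the point prediction is just the
#intercept plus the SAT, hsrank and hsperc contributions.
unname(coef(Model_1)["(Intercept)"] +
       coef(Model_1)["SAT"] * 1390 +
       coef(Model_1)["hsrank"] * 8 +
       coef(Model_1)["hsperc"] * 6)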