-
Notifications
You must be signed in to change notification settings - Fork 0
/
wnba_analysis.Rmd
154 lines (117 loc) · 5.42 KB
/
wnba_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
---
title: "2024 WNBA Analysis"
author: "Michelle Hewson"
date: "2024-06-13"
output: html_document
---
```{r setup, include=FALSE}
# Set the document-wide chunk default: echo each chunk's source code
knitr::opts_chunk$set(echo = TRUE)
```
# Introduction
### This R file analyzes WNBA season data with several popular sports-modeling frameworks: the normal model, the Bradley-Terry model, and the Thurstone-Mosteller model. Team strengths are estimated from current-season data, and outcome predictions are calculated for games that have not been played yet. These predictions will help inform sports analysts (and sports bettors) ahead of the upcoming WNBA playoffs.
## Read in and process data
```{r}
# Load the two game tables used throughout the analysis:
#   wnba      - read from the "completed games" file
#   wnba_full - read from the "all games" file
wnba <- read.csv(file = "completed_games_wnba_2024.csv")
wnba_full <- read.csv(file = "all_games_wnba_2024.csv")
```
```{r}
# Every team that appears in the completed games. Home teams come first, in
# order of first appearance (the same order unique(wnba$HomeTeam) gives); a
# team that has so far only played away is appended instead of being dropped.
teams <- union(wnba$HomeTeam, wnba$AwayTeam)
num_teams <- length(teams)
# The contrast matrix below needs at least two teams
stopifnot("need at least two teams" = num_teams >= 2)
# Construct design matrix for model calculations: one row per game, with
# +1 in the home team's column and -1 in the away team's column
X <- outer(wnba$HomeTeam, teams, "==") - outer(wnba$AwayTeam, teams, "==")
# Contrast matrix: the first num_teams - 1 teams each get their own column and
# the last team's row is all -1, so the last team's value is minus the sum of
# the others
W <- rbind(diag(num_teams - 1), rep(-1, num_teams - 1))
# Create schedule matrix in the reduced (num_teams - 1 column) parameterization
X_star <- X %*% W
# Stored as a matrix column so model formulas can refer to X_star via `data`
wnba$X_star <- X_star
```
## Normal model to estimate WNBA team strengths
```{r}
# Normal model: regress the point differential on the schedule contrasts plus
# the home indicator, with the intercept suppressed
normal_ability_model <- lm(PtsDiff ~ 0 + X_star + AtHome, data = wnba)
# Residual standard error of the fit
resid_stderr <- summary(normal_ability_model)$sigma
# W extended by one row and one column so the AtHome coefficient is carried
# through unchanged next to the team contrasts
W_h <- rbind(cbind(W, 0), c(rep(0, num_teams - 1), 1))
# Map the fitted coefficients back to one value per team, followed by HFA
team_ability_estimates <- W_h %*% coef(normal_ability_model)
# Covariance matrix of the mapped estimates
V <- W_h %*% vcov(normal_ability_model) %*% t(W_h)
rownames(V) <- c(teams, "HFA") # HFA = home field advantage
colnames(V) <- c(teams, "HFA")
normal_estimates <- data.frame(
  Estimate = team_ability_estimates,
  StdErr = sqrt(diag(V))
)
row.names(normal_estimates) <- c(teams, "HFA")
# Ranking WNBA teams by normal strength estimates (largest first)
team_ranks_normal <- normal_estimates[
  order(-normal_estimates[["Estimate"]]), ,
  drop = FALSE
]
print(team_ranks_normal)
# Write to csv for analysis in Python
write.csv(team_ranks_normal, file = "normal_estimates_wnba2024.csv")
```
## Another approach: Thurstone-Mosteller model to estimate WNBA team strengths
```{r}
# Thurstone-Mosteller model: binomial GLM with a probit link on the home-win
# indicator, using the schedule contrasts and home indicator, no intercept
tm_model <- glm(
  HomeWin ~ 0 + X_star + AtHome,
  family = binomial(link = "probit"),
  data = wnba
)
# Contrast matrix extended with a pass-through row/column for home field
# advantage
W_h <- rbind(cbind(W, 0), c(rep(0, num_teams - 1), 1))
# Covariance of the per-team (plus HFA) estimates
V.probit <- W_h %*% summary(tm_model)$cov.unscaled %*% t(W_h)
# Team strength estimates with their standard errors
thurstone_estimates <- data.frame(
  Est = W_h %*% coef(tm_model),
  StdErr = sqrt(diag(V.probit))
)
# Label rows and the covariance matrix by team, with HFA last
row.names(thurstone_estimates) <- c(teams, "HFA")
rownames(V.probit) <- c(teams, "HFA")
colnames(V.probit) <- c(teams, "HFA")
# Strongest first
thurstone_estimates <- thurstone_estimates[
  order(-thurstone_estimates[["Est"]]), ,
  drop = FALSE
]
thurstone_estimates
# Write to csv for analysis in Python
write.csv(thurstone_estimates, file = "thurstone_estimates_wnba2024.csv")
```
## Another approach: Bradley-Terry model to estimate WNBA team strengths
```{r}
# Bradley-Terry model: binomial GLM (default logit link) on the home-win
# indicator, using the schedule contrasts and home indicator, no intercept.
# Relies on `wnba`, `teams` and `W_h` created in earlier chunks.
bt_model <- glm(HomeWin ~ 0 + X_star + AtHome,
  family = "binomial", data = wnba
)
# Bradley-Terry estimates: one value per team, followed by HFA
bt_estimates <- data.frame(Est = W_h %*% bt_model$coefficients)
# Variance-covariance matrix of those estimates. Kept under its own name so
# the probit covariance in V.probit is not overwritten by this logit fit.
V.logit <- W_h %*% summary(bt_model)$cov.unscaled %*% t(W_h)
# Standard error
bt_estimates$Stderr <- sqrt(diag(V.logit))
# Fix row names and format dataframe. Rows stay in `teams` order with HFA last.
row.names(bt_estimates) <- c(teams, "HFA")
dimnames(V.logit) <- list(c(teams, "HFA"), c(teams, "HFA"))
bt_estimates
# Write to csv for analysis in Python
write.csv(bt_estimates, "bt_estimates_wnba2024.csv")
```
## Paired comparison models (like the Thurstone-Mosteller and Bradley-Terry models) can be used to estimate the total number of games a team will have won by the end of a season, using the game outcomes observed midway through the season
## Here we use Bradley-Terry estimates to predict the number of games each team will win before the playoffs; once the entire season is complete, we can check how close the predictions were
```{r}
# Project each team's season win total from a Bradley-Terry fit on the full
# schedule. Uses `wnba_full`, `teams` and `num_teams` from earlier chunks.

# Home-win indicator: 1 or 0 when both scores are present, NA otherwise
wnba_full["HomeWin"] <- ifelse(wnba_full$HomePts > wnba_full$AwayPts, 1, 0)
wnba_full["AtHome"] <- 1
# Construct design matrix (schedule) over every game: +1 in the home team's
# column, -1 in the away team's column
X_24 <- outer(wnba_full$HomeTeam, teams, "==") -
  outer(wnba_full$AwayTeam, teams, "==")
W <- rbind(diag(num_teams - 1), rep(-1, num_teams - 1))
X_star <- X_24 %*% W
wnba_full$X_star <- X_star
W_h <- rbind(cbind(W, 0), c(rep(0, num_teams - 1), 1))
# Fit BT model. With R's default na.action, rows whose HomeWin is NA are left
# out of the fit, so only games with a result inform the estimates.
bt_model_full_schedule <- glm(HomeWin ~ 0 + X_star + AtHome,
  family = "binomial", data = wnba_full
)
V.logit <- W_h %*% summary(bt_model_full_schedule)$cov.unscaled %*% t(W_h)
dimnames(V.logit) <- list(c(teams, "HFA"), c(teams, "HFA"))
# Keep this fit's estimates and standard errors together in their own table,
# rather than writing these standard errors onto bt_estimates, whose Est
# column comes from a different model object
bt_estimates_full_schedule <- data.frame(
  Est = W_h %*% bt_model_full_schedule$coefficients,
  Stderr = sqrt(diag(V.logit))
)
row.names(bt_estimates_full_schedule) <- c(teams, "HFA")
# Predicted probability that the home team wins, for every game in the season
pred_prob <- predict(bt_model_full_schedule,
  newdata = wnba_full, type = "response"
)
# y is the observed result (1 = home win, 0 = home loss) where one exists, and
# the predicted home-win probability where HomeWin is NA. Testing HomeWin
# covers a missing score on either side, not just a missing HomePts.
wnba_full$y <- ifelse(is.na(wnba_full$HomeWin), pred_prob, wnba_full$HomeWin)
# Calculate number of wins based off of y vector.
# ystar is +1 for a home win and -1 for a home loss (fractional for
# predictions), so t(X_24) %*% ystar is each team's wins minus losses.
ystar <- 2 * wnba_full$y - 1
Xty <- as.vector(t(X_24) %*% ystar)
# Games per team, counted from the schedule itself instead of assuming 40
games_per_team <- colSums(abs(X_24))
# wins - losses = Xty and wins + losses = games, so wins = (Xty + games) / 2
wnba_wins_preds <- (Xty + games_per_team) / 2
names(wnba_wins_preds) <- teams
# Sort the predicted win totals, largest first (still a named vector)
wnba_wins_preds <- sort(wnba_wins_preds, decreasing = TRUE)
wnba_wins_preds
write.csv(wnba_wins_preds, "wnba_win_number_preds.csv")
```