---
title: "Model Sports Injuries as Counts"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{model-injury-data-i}
%\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
editor_options:
chunk_output_type: console
---
```{r, include = FALSE}
library(knitr)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE) # to supress R-CMD check
## to fold/hook the code
hook_output <- knit_hooks$get("output")
knit_hooks$set(output = function(x, options) {
lines <- options$output.lines
if (is.null(lines)) {
return(hook_output(x, options)) # pass to default hook
}
x <- unlist(strsplit(x, "\n"))
more <- "..."
if (length(lines) == 1) {
if (length(x) > lines) {
# truncate the output, but add ....
x <- c(head(x, lines), more)
}
} else {
x <- c(if (abs(lines[1]) > 1) more else NULL,
x[lines],
if (length(x) > lines[abs(length(lines))]) more else NULL
)
}
# paste these lines together
x <- paste(c(x, ""), collapse = "\n")
hook_output(x, options)
})
modern_r <- getRversion() >= "4.1.0"
```
```{r setup, message = F, warning = F}
library(injurytools)
library(dplyr)
library(stringr)
library(tidyr)
library(lme4)
library(pscl)
# library(glmmTMB)
library(MASS)
library(ggplot2)
library(gridExtra)
library(knitr)
```
```{r setup2, echo = F}
# set the global theme of all plots
theme_set(theme_bw())
```
**Example data**: we continue exploring the cohort of Liverpool Football Club male's first team players over two consecutive seasons, 2017-2018 and 2018-2019, scrapped from https://www.transfermarkt.com/ website[^visualize-note-1].
[^visualize-note-1]: These data sets are provided for illustrative purposes. We warn that they might not be accurate and could potentially include discrepancies or incomplete information compared to what actually occurred.
This article provides some modelling approaches when injuries are seen as count data.
First, the distribution of the count/rate variables is explored, e.g. number of injuries and number of days lost per exposure time. Afterwards, four different models are presented, that depending on the distribution of the data, will provide a better fit to them due to the distributional assumptions made by each of them.
# Exploring the distribution of injuries
We plot the histograms of the injury incidence and injury burden variables for each season.
See the code to prepare the data
```{r datasetup1, warning = F}
## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 =
raw_df_exposures |> filter(season == "17/18"),
person_id = "player_name",
date = "year",
time_expo = "minutes_played") |>
mutate(seasonb = date2season(date))
df_injuries1718 <- prepare_inj(df_injuries0 =
raw_df_injuries |> filter(season == "17/18"),
person_id = "player_name",
date_injured = "from",
date_recovered = "until")
injd1718 <- prepare_all(data_exposures = df_exposures1718,
data_injuries = df_injuries1718,
exp_unit = "matches_minutes")
```
```{r datasetup2, warning = F}
## 18/19
df_exposures1819 <- prepare_exp(df_exposures0 =
raw_df_exposures |> filter(season == "18/19"),
person_id = "player_name",
date = "year",
time_expo = "minutes_played") |>
mutate(seasonb = date2season(date))
df_injuries1819 <- prepare_inj(df_injuries0 =
raw_df_injuries |> filter(season == "18/19"),
person_id = "player_name",
date_injured = "from",
date_recovered = "until")
injd1819 <- prepare_all(data_exposures = df_exposures1819,
data_injuries = df_injuries1819,
exp_unit = "matches_minutes")
```
```{r, warning = F}
## calculate injury summary statistics
dfs1718 <- calc_summary(injd1718, quiet = T)
dfs1819 <- calc_summary(injd1819, quiet = T)
dfs1718p <- calc_summary(injd1718, overall = FALSE, quiet = T)
dfs1819p <- calc_summary(injd1819, overall = FALSE, quiet = T)
dfsp <- bind_rows("Season 17-18" = dfs1718p,
"Season 18-19" = dfs1819p,
.id = "season")
```
```{r, eval = F}
## plot
p1 <- ggplot(data = dfsp) +
geom_histogram(aes(x = incidence, fill = season)) +
facet_wrap(~season) +
scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
ylab("Number of players") +
xlab("Incidence (number of injuries per 100 player-match)") +
ggtitle("Histogram of injury incidence in each season") +
theme(legend.position = "none")
p2 <- ggplot(data = dfsp) +
geom_histogram(aes(x = burden, fill = season)) +
facet_wrap(~season) +
scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
ylab("Number of players") +
xlab("Burden (number of days lost due to injury per 100 player-match)") +
ggtitle("Histogram of injury burden in each season") +
theme(legend.position = "none")
grid.arrange(p1, p2, ncol = 1)
```
```{r, echo = F, warning = F}
p1 <- ggplot(data = dfsp) +
geom_histogram(aes(x = incidence, fill = season)) +
facet_wrap(~season) +
scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
ylab("Number of players") +
xlab("Incidence (number of injuries per 100 player-match)") +
ggtitle(bquote("Histogram of injury" ~ bold("incidence") ~ "in each season")) +
theme(legend.position = "none")
p2 <- ggplot(data = dfsp) +
geom_histogram(aes(x = burden, fill = season)) +
facet_wrap(~season) +
scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
ylab("Number of players") +
xlab("Burden (number of days lost due to injury per 100 player-match)") +
ggtitle(bquote("Histogram of injury" ~ bold("burden") ~ "in each season")) +
theme(legend.position = "none")
```
Code for further plot specifications
```{r, eval = F}
theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
axis.title = element_text(size = rel(1.3)),
strip.text = element_text(size = rel(1.4)),
plot.title = element_text(size = rel(1.4)),
legend.text = element_text(size = rel(1.3)),
legend.title = element_text(size = rel(1.3)))
p1 <- p1 + theme_counts
p2 <- p2 + theme_counts
```
```{r, warning = F, message = F, echo = F, fig.width = 10, fig.height = 6.8}
theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
axis.title = element_text(size = rel(1.3)),
strip.text = element_text(size = rel(1.4)),
plot.title = element_text(size = rel(1.4)),
legend.text = element_text(size = rel(1.3)),
legend.title = element_text(size = rel(1.3)))
p1 <- p1 + theme_counts
p2 <- p2 + theme_counts
grid.arrange(p1, p2, ncol = 1)
```
In the following, we use `dfs1718p` data.
# - Poisson (mixed) model
We merge into the `dfs1718p` data frame the player-related information (i.e. `positionb`, `age`, `assists`, `goals`, `yellows` etc.) available in the `raw_df_exposures` data frame.
```{r}
## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 =
raw_df_exposures |> filter(season == "17/18"),
person_id = "player_name",
date = "year",
time_expo = "minutes_played") |>
mutate(seasonb = date2season(date))
dfs1718p <- dfs1718p |>
mutate(seasonb = "2017/2018") |>
## join to have info, such as position, age, citizenship etc.
left_join(df_exposures1718, by = c("person_id" = "person_id",
"seasonb" = "seasonb")) |>
## create positionb column
## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
mutate(positionb = factor(str_split_i(position, "_", 1)))
```
```{r}
## quit Goalkeepers
dfs1718p <- dplyr::filter(dfs1718p, positionb != "Goalkeeper") |>
droplevels()
```
To fit a Poisson regression model for injury incidence (considering `positionb` as a covariate), we do:
```{r}
incidence_glm_pois <- glm(ncases ~ positionb, # + offset(log(totalexpo))
offset = log(totalexpo),
data = dfs1718p,
family = poisson)
```
Or alternatively, we can use the [`glmmTMB::glmmTMB`](https://glmmtmb.github.io/glmmTMB/reference/glmmTMB.html) function:
```{r, eval = F}
# incidence_glm_pois2 <- glmmTMB(formula = ncases ~ foot,
# offset = log(totalexpo),
# family = poisson(), data = dfs1718p)
# summary(incidence_glm_pois2)
```
Besides, if we have repeated measurements as in `dfsp`, we can fit a Mixed Model via:
Add more covariates to `dfsp` data frame
```{r}
df_exposures <- prepare_exp(df_exposures0 = raw_df_exposures,
person_id = "player_name",
date = "year",
time_expo = "minutes_played") |>
mutate(seasonb = date2season(date))
dfsp <- dfsp |>
mutate(seasonb = if_else(season == "Season 17-18", "2017/2018", "2018/2019")) |>
## join to have info, such as position, age, citizenship etc.
left_join(df_exposures, by = c("person_id" = "person_id",
"seasonb" = "seasonb")) |>
## create positionb column
## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
mutate(positionb = factor(str_split_i(position, "_", 1))) |>
droplevels()
```
```{r, eval = F}
incidence_glmm_pois <- glmer(formula = ncases ~ positionb + (1 | person_id),
offset = log(totalexpo),
data = dfsp,
family = poisson)
# incidence_glmm_pois2 <- glmmTMB::glmmTMB(formula = ncases ~ positionb + (1 | person_id),
# offset = log(totalexpo),
# data = dfsp,
# family = poisson)
```
We can do the analogue for the burden, e.g.:
```{r}
burden_glm_pois <- glm(ndayslost ~ positionb, offset = log(totalexpo), ## or ~ foot
data = dfs1718p,
family = poisson)
```
As of now, lets stick with the model `burden_glm_pois` and lets interpret the model output.
```{r}
summary(burden_glm_pois)
```
The estimated coefficients are:
```{r}
cbind(estimate = exp(coef(burden_glm_pois)) * c(90*100, 1, 1),
exp(confint(burden_glm_pois)) * c(90*100, 1, 1)) |> # (to report per 100 player-matches)
kable()
```
# - Negative Binomial model
```{r, warning = F}
burden_glm_nb <- glm.nb(ndayslost ~ positionb + offset(log(totalexpo)),
data = dfs1718p)
```
```{r}
summary(burden_glm_nb)
```
# - Zero-inflated Poisson model
```{r}
burden_zinfpois <- zeroinfl(ndayslost ~ positionb | positionb,
offset = log(totalexpo),
data = dfs1718p,
link = "logit",
dist = "poisson",
trace = FALSE, EM = FALSE)
## Or:
# burden_zinfpois <- glmmTMB::glmmTMB(formula = ndayslost ~ 1 + positionb,
# offset = log(totalexpo),
# ziformula = ~ 1 + positionb,
# data = dfs1718p,
# family = poisson)
```
```{r}
summary(burden_zinfpois)
```
# - Zero-inflated Negative Binomial model
```{r}
burden_zinfnb <- zeroinfl(ndayslost ~ positionb | positionb,
offset = log(totalexpo),
data = dfs1718p,
link = "logit",
dist = "negbin",
trace = FALSE, EM = FALSE)
## Or:
# burden_zinfnb <- glmmTMB::glmmTMB(ndayslost ~ 1 + positionb, offset = log(totalexpo),
# ziformula = ~ 1 + positionb,
# data = dfs1718p,
# family = nbinom2)
```
```{r}
summary(burden_zinfnb)
```
# Model comparison
Finally, lets compare the four models.
We compute the conditional predicted mean probabilities of each model:
```{r}
## pois
## predprob: for each subj, prob of observing each value
phat_pois <- predprob(burden_glm_pois)
phat_pois_mn <- apply(phat_pois, 2, mean)
## nb
phat_nb <- predprob(burden_glm_nb)
phat_nb_mn <- apply(phat_nb, 2, mean)
## zinfpois
phat_zinfpois <- predprob(burden_zinfpois)
phat_zinfpois_mn <- apply(phat_zinfpois, 2, mean)
## zinfnb
phat_zinfnb <- predprob(burden_zinfnb)
phat_zinfnb_mn <- apply(phat_zinfnb, 2, mean)
## put in a data frame
idx <- seq(1, 62, length.out = 30)
df_probs <- data.frame(phat_pois_mn = phat_pois_mn[idx],
phat_nb_mn = phat_nb_mn[idx],
phat_zinfpois_mn = phat_zinfpois_mn[idx],
phat_zinfnb_mn = phat_zinfnb_mn[idx],
x= idx) |>
tidyr::gather(key = "prob_type", value = "value", -x) |>
mutate(prob_type = factor(prob_type))
```
And we display them over the histogram of the data to examine the fits:
```{r, fig.width = 9, fig.height = 4.8}
ggplot(data = dfs1718p) +
geom_histogram(aes(x = burden/100, after_stat(density)),
breaks = seq(-0.5, 62, length.out = 30),
col = "black", alpha = 0.5) +
geom_point(data = df_probs, aes(x = x, y = value,
group = prob_type, col = prob_type)) +
geom_line(data = df_probs, aes(x = x, y = value,
group = prob_type, col = prob_type)) +
ylim(c(0, 0.3)) + xlab("Injury burden") + ylab("Density") +
scale_color_manual(name = "Model:",
labels = c("Negative Binomial", "Poisson",
"Zero-Inflated Negative Binomial",
"Zero-Inflated Poisson"),
values = c("darkblue", "chocolate", "purple", "red")) +
ggtitle("Histogram of injury burden in 2017/2018\nwith conditional Poisson, NB, ZIP and ZINB Densities") +
theme_counts +
theme(legend.position = c(0.7, 0.7))
```
```{r, echo = F, eval = F}
## using base R
with(dfs1718p, {
hist(ndayslost, prob = TRUE, breaks = seq(-0.5, 316.5, length.out = 30),
xlab = "Injury burden (number of injuries per player-season)",
main = "Histogram of overall injury burden\nwith conditional Poisson, NB, ZIP and ZINB Densities")
lines(x = idx, y = phat_pois_mn[idx], type = "b", lwd = 2, col = "black")
lines(x = idx, y = phat_nb_mn[idx], type = "b", lwd = 2, col = "purple")
lines(x = idx, y = phat_zinfpois_mn[idx], type = "b", lwd = 2, col = "red")
lines(x = idx, y = phat_zinfnb_mn[idx], type = "b", lwd = 2, col = "blue")
})
legend(250, 0.026, c("Poisson", "NB", "ZIP","ZINB"), lty = 1,
col = c("black", "purple", "red","blue"), lwd = 2)
```
Besides, we compute goodness of fit measures such as AIC, BIC and deviance explained:
```{r}
models <- list("Poisson model" = burden_glm_pois,
"Negative binomial model" = burden_glm_nb,
"Zero-inflated Poisson model" = burden_zinfpois,
"Zero-inflated Negative Binomial model" = burden_zinfnb)
res_gof <- lapply(models, function(model) {
aic <- AIC(model)
bic <- BIC(model)
if (class(model)[[1]] == "zeroinfl") {
deviance <- -2*logLik(model)[[1]]
null_model <- update(model, .~ -positionb)
null_deviance <- -2*logLik(null_model)[[1]]
} else {
deviance <- model$deviance
null_deviance <- model$null.deviance
}
dev_expl <- (null_deviance - deviance)/null_deviance * 100
return(res = data.frame(aic = aic, bic = bic, dev_expl = dev_expl))
})
```
```{r}
res_gof |>
bind_rows(.id = "model") |>
## arrange them according to dev_expl.
arrange(desc(dev_expl)) |>
knitr::kable(digits = 2,
col.names = c("Model", "AIC", "BIC", "Deviance Explained"))
```
According to these measures, the Negative Binomial model (`burden_glm_nb`) fits these data best.