The Premier League, often referred to outside the UK as the English Premier League, or sometimes the EPL, is the top level of the English football league system. Contested by 20 clubs, it operates on a system of promotion and relegation with the English Football League.
In this project, I have attempted to visualize a few of the most common football stats as seen on major sports channels and publications. The raw data used can be found on this website.
Fetching the data from the github repository
# Download the raw match-level CSV directly from the project's GitHub repository.
full.data <- read.csv(
  file = "https://raw.githubusercontent.com/rohanprad/Data-Science-Project/main/Dataset/PL20-21.csv"
)
Selecting only the relevant columns (variables)
library(dplyr)
# Keep the columns from Date through AR, then drop the Time and Referee columns.
tidy.data <- full.data %>%
  select(Date:AR) %>%
  select(-Time, -Referee)
Getting the List of the 20 Team Names
# Distinct home-team names, in order of first appearance.
# NOTE(review): assumes every club appears as a home team at least once.
clubs <- unique(tidy.data[["HomeTeam"]])
Then I defined a function called getTeamData() which was used to summarize the results of all the matches and return a single row dataframe of relevant data columns for each team
Then I created an empty dataframe (the league table) to store the final results for all 20 teams
# Empty league table carrying the 27 summary column names. It is kept as the
# first rbind() argument so that an empty `clubs` vector still yields a
# zero-row data frame instead of NULL.
league.table <- data.frame(matrix(nrow = 0, ncol = 27))
colnames(league.table) <- c(
  "Club", "P", "W", "D", "L", "GD", "Pts", "GF", "GA",
  "GF.Avg", "GA.Avg", "HW", "HD", "HL", "HW.Rate",
  "AW", "AD", "AL", "AW.Rate", "W.Rate", "S", "ST",
  "C", "F", "B", "YC", "RC"
)
# Build the one-row summary for every club with getTeamData() and bind them in
# a single rbind() call. This avoids re-copying the table on every loop
# iteration and no longer creates a global named `row` that shadows base::row().
# as.character() keeps the per-club argument a plain string, as the for loop did.
league.table <- do.call(
  rbind,
  c(list(league.table), lapply(as.character(clubs), getTeamData))
)
Adding two new columns for each team defining the colour and the city
# The colour and city vectors below are written in alphabetical club order, so
# the table is sorted by club name first to make position i match club i.
league.table <- arrange(league.table, Club)
# One hex colour per club, in the same alphabetical order as the sorted table.
teamColors <- c(
  "#EF0107", "#95BFE5", "#0057B8", "#6C1D45", "#034694", "#A7A5A6",
  "#003399", "#000000", "#FFCD00", "#003090", "#9C1310", "#6CABDD",
  "#DA291C", "#241F20", "#EE2737", "#D71920", "#132257", "#122F67",
  "#7A263A", "#FDB913"
)
# Home city per club, same ordering; stored as a factor so it can be tabulated.
# ("Leicester" was previously misspelled "Leceister".)
teamCities <- factor(c(
  "London", "Birmingham", "Brighton & Hove", "Burnley",
  "London", "London", "Liverpool", "London", "Leeds", "Leicester",
  "Liverpool", "Manchester", "Manchester", "Newcastle", "Sheffield",
  "Southampton", "London", "West Bromwich", "London", "Wolverhampton"
))
league.table$Col <- teamColors
league.table$City <- teamCities
Arranging the league table by points > goal difference > goals for
# Order the table by points, breaking ties on goal difference and then goals scored.
league.table <- league.table %>%
  arrange(desc(Pts), desc(GD), desc(GF))
Converting the clubs column into a factor
# Store the club names as a factor (levels are the sorted club names).
league.table <- league.table %>%
  mutate(Club = as.factor(Club))
Creating a CSV file to store the final tidy data
# Save the finished league table (row names included, as write.csv does by default).
write.csv(x = league.table, file = "./Dataset/TidyData.csv")
# Print per-column summaries of the league table.
summary(object = league.table)
## Club P W D
## Arsenal : 1 Min. :28 Min. : 3.00 Min. : 2.0
## Aston Villa : 1 1st Qu.:29 1st Qu.: 7.75 1st Qu.: 5.0
## Brighton : 1 Median :29 Median :12.00 Median : 7.0
## Burnley : 1 Mean :29 Mean :11.10 Mean : 6.8
## Chelsea : 1 3rd Qu.:29 3rd Qu.:14.00 3rd Qu.: 9.0
## Crystal Palace: 1 Max. :30 Max. :22.00 Max. :11.0
## (Other) :14
## L GD Pts GF
## Min. : 3.00 Min. :-37.00 Min. :14.00 Min. :16.00
## 1st Qu.: 8.75 1st Qu.:-15.00 1st Qu.:32.75 1st Qu.:28.00
## Median :11.00 Median : 0.50 Median :40.00 Median :39.50
## Mean :11.10 Mean : 0.00 Mean :40.10 Mean :37.95
## 3rd Qu.:14.00 3rd Qu.: 13.75 3rd Qu.:48.25 3rd Qu.:45.75
## Max. :23.00 Max. : 43.00 Max. :71.00 Max. :64.00
##
## GA GF.Avg GA.Avg HW HD
## Min. :21.00 Min. :0.600 Min. :0.70 Min. : 2.0 Min. :1.00
## 1st Qu.:32.00 1st Qu.:1.000 1st Qu.:1.10 1st Qu.: 4.0 1st Qu.:2.00
## Median :36.50 Median :1.400 Median :1.25 Median : 5.0 Median :3.00
## Mean :37.95 Mean :1.320 Mean :1.31 Mean : 5.4 Mean :3.40
## 3rd Qu.:47.00 3rd Qu.:1.625 3rd Qu.:1.60 3rd Qu.: 7.0 3rd Qu.:4.25
## Max. :57.00 Max. :2.100 Max. :2.00 Max. :12.0 Max. :7.00
##
## HL HW.Rate AW AD AL
## Min. : 2.00 Min. :12.00 Min. : 1.0 Min. :0.0 Min. : 0.00
## 1st Qu.: 4.75 1st Qu.:28.50 1st Qu.: 4.0 1st Qu.:3.0 1st Qu.: 3.75
## Median : 6.00 Median :36.00 Median : 6.0 Median :3.0 Median : 5.00
## Mean : 5.70 Mean :37.35 Mean : 5.7 Mean :3.4 Mean : 5.40
## 3rd Qu.: 6.25 3rd Qu.:50.00 3rd Qu.: 7.0 3rd Qu.:4.0 3rd Qu.: 7.00
## Max. :11.00 Max. :75.00 Max. :10.0 Max. :7.0 Max. :12.00
##
## AW.Rate W.Rate S ST C
## Min. : 7.0 Min. :10.0 Min. :237.0 Min. : 77.0 Min. :101.0
## 1st Qu.:27.0 1st Qu.:27.0 1st Qu.:282.8 1st Qu.:105.5 1st Qu.:122.0
## Median :41.5 Median :41.0 Median :347.0 Median :116.5 Median :141.0
## Mean :39.3 Mean :38.2 Mean :339.6 Mean :122.0 Mean :144.6
## 3rd Qu.:47.0 3rd Qu.:48.0 3rd Qu.:381.0 3rd Qu.:142.5 3rd Qu.:166.2
## Max. :71.0 Max. :73.0 Max. :471.0 Max. :177.0 Max. :199.0
##
## F B YC RC
## Min. :268.0 Min. :30.00 Min. :30.00 Min. :0.00
## 1st Qu.:303.5 1st Qu.:38.75 1st Qu.:38.00 1st Qu.:0.00
## Median :312.0 Median :44.00 Median :41.00 Median :1.50
## Mean :318.6 Mean :43.75 Mean :42.15 Mean :1.60
## 3rd Qu.:334.8 3rd Qu.:48.25 3rd Qu.:47.25 3rd Qu.:2.25
## Max. :381.0 Max. :58.00 Max. :55.00 Max. :5.00
##
## Col City
## Length:20 London :6
## Class :character Liverpool :2
## Mode :character Manchester :2
## Birmingham :1
## Brighton & Hove:1
## Burnley :1
## (Other) :7
# Number of clubs per city.
table(league.table[["City"]])
##
## Birmingham Brighton & Hove Burnley Leceister Leeds
## 1 1 1 1 1
## Liverpool London Manchester Newcastle Sheffield
## 2 6 2 1 1
## Southampton West Bromwich Wolverhampton
## 1 1 1
# Number of rows per club (each club should appear exactly once).
table(league.table[["Club"]])
##
## Arsenal Aston Villa Brighton Burnley
## 1 1 1 1
## Chelsea Crystal Palace Everton Fulham
## 1 1 1 1
## Leeds Leicester Liverpool Man City
## 1 1 1 1
## Man United Newcastle Sheffield United Southampton
## 1 1 1 1
## Tottenham West Brom West Ham Wolves
## 1 1 1 1
# Club-by-city contingency table.
table(
  league.table[["Club"]],
  league.table[["City"]]
)
##
## Birmingham Brighton & Hove Burnley Leceister Leeds Liverpool
## Arsenal 0 0 0 0 0 0
## Aston Villa 1 0 0 0 0 0
## Brighton 0 1 0 0 0 0
## Burnley 0 0 1 0 0 0
## Chelsea 0 0 0 0 0 0
## Crystal Palace 0 0 0 0 0 0
## Everton 0 0 0 0 0 1
## Fulham 0 0 0 0 0 0
## Leeds 0 0 0 0 1 0
## Leicester 0 0 0 1 0 0
## Liverpool 0 0 0 0 0 1
## Man City 0 0 0 0 0 0
## Man United 0 0 0 0 0 0
## Newcastle 0 0 0 0 0 0
## Sheffield United 0 0 0 0 0 0
## Southampton 0 0 0 0 0 0
## Tottenham 0 0 0 0 0 0
## West Brom 0 0 0 0 0 0
## West Ham 0 0 0 0 0 0
## Wolves 0 0 0 0 0 0
##
## London Manchester Newcastle Sheffield Southampton
## Arsenal 1 0 0 0 0
## Aston Villa 0 0 0 0 0
## Brighton 0 0 0 0 0
## Burnley 0 0 0 0 0
## Chelsea 1 0 0 0 0
## Crystal Palace 1 0 0 0 0
## Everton 0 0 0 0 0
## Fulham 1 0 0 0 0
## Leeds 0 0 0 0 0
## Leicester 0 0 0 0 0
## Liverpool 0 0 0 0 0
## Man City 0 1 0 0 0
## Man United 0 1 0 0 0
## Newcastle 0 0 1 0 0
## Sheffield United 0 0 0 1 0
## Southampton 0 0 0 0 1
## Tottenham 1 0 0 0 0
## West Brom 0 0 0 0 0
## West Ham 1 0 0 0 0
## Wolves 0 0 0 0 0
##
## West Bromwich Wolverhampton
## Arsenal 0 0
## Aston Villa 0 0
## Brighton 0 0
## Burnley 0 0
## Chelsea 0 0
## Crystal Palace 0 0
## Everton 0 0
## Fulham 0 0
## Leeds 0 0
## Leicester 0 0
## Liverpool 0 0
## Man City 0 0
## Man United 0 0
## Newcastle 0 0
## Sheffield United 0 0
## Southampton 0 0
## Tottenham 0 0
## West Brom 1 0
## West Ham 0 0
## Wolves 0 1
Chi Squared Test on Club and City
Null Hypothesis: The variables are independent.
# summary() of a contingency table reports the chi-squared test of independence.
summary(table(
  league.table[["Club"]],
  league.table[["City"]]
))
## Number of cases in table: 20
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 240, df = 228, p-value = 0.2798
## Chi-squared approximation may be incorrect
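p-value > 0.05: we fail to reject the null hypothesis, so there is no evidence that Club and City are dependent. (With only one observation per club every expected cell count is far below 5, which is why R warns that the chi-squared approximation may be incorrect.)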
5% quantile of the number of goals scored (the value below which 5% of the teams fall)
# 5th percentile of goals scored.
quantile(x = league.table$GF, probs = 0.05)
## 5%
## 19.8
5% and 95% quantiles of the number of goals scored
# 5th and 95th percentiles of goals scored.
quantile(x = league.table$GF, probs = c(0.05, 0.95))
## 5% 95%
## 19.8 56.4
Quartiles of the number of goals scored (quantiles at 25% intervals, from the minimum to the maximum)
# Default quantiles: 0%, 25%, 50%, 75% and 100% of goals scored.
quantile(x = league.table$GF)
## 0% 25% 50% 75% 100%
## 16.00 28.00 39.50 45.75 64.00
Using the t.test and asking if the mean of points can be 40
# Two-sided one-sample t-test of H0: mean points = 40 (default 95% confidence interval).
t.test(x = league.table$Pts, mu = 40)
##
## One Sample t-test
##
## data: league.table$Pts
## t = 0.032737, df = 19, p-value = 0.9742
## alternative hypothesis: true mean is not equal to 40
## 95 percent confidence interval:
## 33.70649 46.49351
## sample estimates:
## mean of x
## 40.1
p-value (0.9742) > 0.05, so the data are consistent with a mean of 40; 40 also lies inside the 95% confidence interval.
We fail to reject the null hypothesis.
Using the t.test and asking if the mean of points can be 40 with a confidence level of 99%
# Same one-sample t-test of H0: mean points = 40, reporting a 99% confidence interval.
t.test(x = league.table$Pts, mu = 40, conf.level = 0.99)
##
## One Sample t-test
##
## data: league.table$Pts
## t = 0.032737, df = 19, p-value = 0.9742
## alternative hypothesis: true mean is not equal to 40
## 99 percent confidence interval:
## 31.36077 48.83923
## sample estimates:
## mean of x
## 40.1
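The p-value (0.9742) is unchanged by the confidence level and is far above 0.01, so the data are consistent with a mean of 40; 40 also lies inside the 99% confidence interval.
We fail to reject the null hypothesis.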
Calculating the confidence interval for the (pseudo)median of points using the Wilcoxon signed-rank test
# conf.int = TRUE adds the (pseudo)median estimate and its confidence interval.
# The reported p-value tests a location of 0 (no mu is given), so only the
# interval is of interest here.
wilcox.test(x = league.table$Pts, conf.int = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: league.table$Pts
## V = 210, p-value = 9.542e-05
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 33.49996 46.00002
## sample estimates:
## (pseudo)median
## 40.02641
Using the Shapiro Test for normality
# Shapiro-Wilk test of normality on the points column.
shapiro.test(x = league.table$Pts)
##
## Shapiro-Wilk normality test
##
## data: league.table$Pts
## W = 0.98854, p-value = 0.9957
p-value > 0.05, so we cannot reject the hypothesis that the points are normally distributed (the test gives no evidence against normality).
Checking if the correlation between points and the number of wins is significant
# Pearson correlation test between points and wins.
cor.test(x = league.table$Pts, y = league.table$W)
##
## Pearson's product-moment correlation
##
## data: league.table$Pts and league.table$W
## t = 24.449, df = 18, p-value = 2.931e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9623379 0.9942835
## sample estimates:
## cor
## 0.9852749
p-value < 0.05 so a significant correlation exists.
Checking if the correlation between points and the number of yellow cards is significant
# Pearson correlation test between points and yellow cards.
cor.test(x = league.table$Pts, y = league.table$YC)
##
## Pearson's product-moment correlation
##
## data: league.table$Pts and league.table$YC
## t = -2.015, df = 18, p-value = 0.05908
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.73247947 0.01666435
## sample estimates:
## cor
## -0.4290202
p-value > 0.05 so a significant correlation does not exist.
Loading the necessary packages
library(ggplot2)
#install.packages("devtools")
#devtools::install_github("slowkow/ggrepel")
library(ggrepel)
# Horizontal bar chart of one league-table column per club.
#   data:  league table; must contain Club, Col (hex colour) and the plotted column
#   value: unquoted column (or expression) giving the bar length
#   y.lab: axis label for the plotted value
#   title: plot title
# Returns the ggplot object, which prints when the call is made at top level.
plotClubBars <- function(data, value, y.lab, title) {
  ggplot(data, aes(Club, {{ value }})) +
    # The hex codes are mapped through an identity scale so each colour stays
    # tied to its own row instead of relying on the order of a separate vector.
    geom_bar(aes(fill = Col), stat = "identity") +
    scale_fill_identity() +
    theme_bw() +
    coord_flip() +
    labs(x = "Club", y = y.lab, title = title)
}

# Labelled scatter plot of two league-table columns with a straight-line fit.
#   data:         league table; must contain Club, Col and the plotted columns
#   x.var, y.var: unquoted columns (or expressions) for the two axes
#   x.lab, y.lab: axis labels
#   title:        plot title
#   nudge.y:      vertical offset of the club labels relative to their points
#   label.size:   text size of the club labels
# Returns the ggplot object, which prints when the call is made at top level.
plotClubScatter <- function(data, x.var, y.var, x.lab, y.lab, title,
                            nudge.y = -0.25, label.size = 3) {
  ggplot(data, aes({{ x.var }}, {{ y.var }})) +
    geom_point(aes(colour = Col), size = 4) +
    scale_colour_identity() +
    # se = FALSE is spelled out: F is reassignable, and this table even has a
    # column called F.
    geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
    theme_bw() +
    labs(x = x.lab, y = y.lab, title = title) +
    geom_text_repel(
      aes(label = Club),
      nudge_x = 0, nudge_y = nudge.y,
      size = label.size
    )
}

# Points, goals scored and goals conceded per club.
plotClubBars(league.table, Pts, "Points", "Premier League 2020-21 Points")
plotClubBars(league.table, GF, "Goals", "Premier League 2020-21 Goals Scored")
plotClubBars(league.table, GA, "Goals", "Premier League 2020-21 Goals Conceded")

# Points against goals scored / conceded.
plotClubScatter(league.table, GF, Pts, "Goals", "Points",
                "Points Vs Goals Scored",
                nudge.y = -1.0, label.size = 2.5)
plotClubScatter(league.table, GA, Pts, "Goals", "Points",
                "Points Vs Goals Conceded",
                nudge.y = -1.0)

# Points against home and away win percentage.
plotClubScatter(league.table, HW.Rate, Pts, "Win %", "Points",
                "Home Performance")
plotClubScatter(league.table, AW.Rate, Pts, "Win %", "Points",
                "Away Performance")

# Shots against shots on target, and against goals per 100 shots.
plotClubScatter(league.table, S, ST, "Shots", "Shots on Target",
                "Creativity")
plotClubScatter(league.table, S, (GF / S * 100), "Shots", "Score Rate",
                "Clinical Attacks")

# Fouls against bookings.
plotClubScatter(league.table, F, B, "Fouls", "Bookings",
                "Fair Play/ Tactical Fouls")

# Yellow and red cards per club.
plotClubBars(league.table, YC, "Yellow Cards",
             "Premier League 2020-21 Yellow Cards")
plotClubBars(league.table, RC, "Red Cards",
             "Premier League 2020-21 Red Cards")
By Rohan Pradhan (2018CSC1047)