Introduction


The Premier League, often referred to outside the UK as the English Premier League, or sometimes the EPL, is the top level of the English football league system. Contested by 20 clubs, it operates on a system of promotion and relegation with the English Football League.

In this project, I have attempted to visualize a few of the most common football stats as seen on major sports channels and publications. The raw data used can be found on this website.

 

 

Getting and Cleaning the Data


Fetching the data from the github repository

# Read the raw match-level CSV directly from the project's GitHub repository.
full.data <- read.csv(
  file = "https://raw.githubusercontent.com/rohanprad/Data-Science-Project/main/Dataset/PL20-21.csv"
)

 

Selecting only the relevant columns

library(dplyr)

# Keep the columns from Date through AR, then drop Time and Referee,
# all in a single select() call.
tidy.data <- select(full.data, Date:AR, -Time, -Referee)

 

Getting the List of the 20 Team Names

# Distinct values of the HomeTeam column, in order of first appearance.
clubs <- unique(tidy.data[["HomeTeam"]])

 

Then I defined a function called getTeamData() which was used to summarize the results of all the matches and return a single row dataframe of relevant data columns for each team

 

Then I created an empty dataframe (the league table) to store the final results for all 20 teams

# Build the league table: one summary row per club, stacked into a single
# data frame.
#
# getTeamData() (defined elsewhere in this document) is called once per club
# and the resulting rows are combined with ONE rbind() call instead of growing
# the table inside a loop, which copies the whole table on every iteration.
# rbind() ignores zero-row data frames, so the empty 27-column template only
# matters when there are no clubs at all; it is kept for exactly that case.
league.columns <- c("Club", "P", "W", "D", "L", "GD", "Pts", "GF", "GA",
                    "GF.Avg", "GA.Avg", "HW", "HD", "HL", "HW.Rate",
                    "AW", "AD", "AL", "AW.Rate", "W.Rate", "S", "ST",
                    "C", "F", "B", "YC", "RC")

# as.character() mirrors what a for loop does when iterating over a factor.
team.rows <- lapply(as.character(clubs), getTeamData)

if (length(team.rows) > 0) {
  league.table <- do.call(rbind, team.rows)
} else {
  league.table <- data.frame(matrix(nrow = 0, ncol = length(league.columns)))
  colnames(league.table) <- league.columns
}

 

Adding two new columns for each team defining the colour and the city

# Attach a colour (hex code) and a home city to every club in the league table.
#
# Both lookups are keyed by club name, so the values stay attached to the right
# club even if the row order or the set of clubs changes. Matching by position
# after an alphabetical sort would silently mislabel clubs in that case.
# The city for Leicester was previously misspelled "Leceister"; it is corrected
# here, so printed tables of City change accordingly when the report is re-run.
league.table <- arrange(league.table, Club)

club.colors <- c(
  "Arsenal"          = "#EF0107",
  "Aston Villa"      = "#95BFE5",
  "Brighton"         = "#0057B8",
  "Burnley"          = "#6C1D45",
  "Chelsea"          = "#034694",
  "Crystal Palace"   = "#A7A5A6",
  "Everton"          = "#003399",
  "Fulham"           = "#000000",
  "Leeds"            = "#FFCD00",
  "Leicester"        = "#003090",
  "Liverpool"        = "#9C1310",
  "Man City"         = "#6CABDD",
  "Man United"       = "#DA291C",
  "Newcastle"        = "#241F20",
  "Sheffield United" = "#EE2737",
  "Southampton"      = "#D71920",
  "Tottenham"        = "#132257",
  "West Brom"        = "#122F67",
  "West Ham"         = "#7A263A",
  "Wolves"           = "#FDB913"
)

club.cities <- c(
  "Arsenal"          = "London",
  "Aston Villa"      = "Birmingham",
  "Brighton"         = "Brighton & Hove",
  "Burnley"          = "Burnley",
  "Chelsea"          = "London",
  "Crystal Palace"   = "London",
  "Everton"          = "Liverpool",
  "Fulham"           = "London",
  "Leeds"            = "Leeds",
  "Leicester"        = "Leicester",
  "Liverpool"        = "Liverpool",
  "Man City"         = "Manchester",
  "Man United"       = "Manchester",
  "Newcastle"        = "Newcastle",
  "Sheffield United" = "Sheffield",
  "Southampton"      = "Southampton",
  "Tottenham"        = "London",
  "West Brom"        = "West Bromwich",
  "West Ham"         = "London",
  "Wolves"           = "Wolverhampton"
)

# as.character() guards against Club being a factor, which would otherwise
# index the lookups by integer level code instead of by name.
club.names <- as.character(league.table$Club)
unknown.clubs <- setdiff(club.names, names(club.colors))
if (length(unknown.clubs) > 0) {
  stop("No colour/city defined for: ",
       paste(unknown.clubs, collapse = ", "), call. = FALSE)
}

# Same shapes as before: a plain character vector and a factor, one per row.
teamColors <- unname(club.colors[club.names])
teamCities <- factor(unname(club.cities[club.names]))

league.table$Col <- teamColors
league.table$City <- teamCities

 

Arranging the league table by points > goal difference > goals for

# Sort into league order: most points first, ties broken by goal difference
# and then by goals scored (all descending).
league.table <- arrange(
  league.table,
  desc(Pts),
  desc(GD),
  desc(GF)
)

 

Converting the clubs column into a factor

# Store Club as a factor; the column is referenced by its bare name inside
# mutate(), which looks it up in league.table itself.
league.table <- mutate(league.table, Club = as.factor(Club))

 

Creating a CSV file to store the final tidy data

# Save the finished league table as a CSV file.
# The output folder is created first (a no-op if it already exists) so the
# write does not fail when the script is run from a directory that has no
# ./Dataset folder.
dir.create("./Dataset", showWarnings = FALSE, recursive = TRUE)
write.csv(league.table, file = "./Dataset/TidyData.csv")

To Links

 

Performing Statistical Tests


# Per-column summary of the whole league table.
summary(object = league.table)
##              Club          P            W               D       
##  Arsenal       : 1   Min.   :28   Min.   : 3.00   Min.   : 2.0  
##  Aston Villa   : 1   1st Qu.:29   1st Qu.: 7.75   1st Qu.: 5.0  
##  Brighton      : 1   Median :29   Median :12.00   Median : 7.0  
##  Burnley       : 1   Mean   :29   Mean   :11.10   Mean   : 6.8  
##  Chelsea       : 1   3rd Qu.:29   3rd Qu.:14.00   3rd Qu.: 9.0  
##  Crystal Palace: 1   Max.   :30   Max.   :22.00   Max.   :11.0  
##  (Other)       :14                                              
##        L               GD              Pts              GF       
##  Min.   : 3.00   Min.   :-37.00   Min.   :14.00   Min.   :16.00  
##  1st Qu.: 8.75   1st Qu.:-15.00   1st Qu.:32.75   1st Qu.:28.00  
##  Median :11.00   Median :  0.50   Median :40.00   Median :39.50  
##  Mean   :11.10   Mean   :  0.00   Mean   :40.10   Mean   :37.95  
##  3rd Qu.:14.00   3rd Qu.: 13.75   3rd Qu.:48.25   3rd Qu.:45.75  
##  Max.   :23.00   Max.   : 43.00   Max.   :71.00   Max.   :64.00  
##                                                                  
##        GA            GF.Avg          GA.Avg           HW             HD      
##  Min.   :21.00   Min.   :0.600   Min.   :0.70   Min.   : 2.0   Min.   :1.00  
##  1st Qu.:32.00   1st Qu.:1.000   1st Qu.:1.10   1st Qu.: 4.0   1st Qu.:2.00  
##  Median :36.50   Median :1.400   Median :1.25   Median : 5.0   Median :3.00  
##  Mean   :37.95   Mean   :1.320   Mean   :1.31   Mean   : 5.4   Mean   :3.40  
##  3rd Qu.:47.00   3rd Qu.:1.625   3rd Qu.:1.60   3rd Qu.: 7.0   3rd Qu.:4.25  
##  Max.   :57.00   Max.   :2.100   Max.   :2.00   Max.   :12.0   Max.   :7.00  
##                                                                              
##        HL           HW.Rate            AW             AD            AL       
##  Min.   : 2.00   Min.   :12.00   Min.   : 1.0   Min.   :0.0   Min.   : 0.00  
##  1st Qu.: 4.75   1st Qu.:28.50   1st Qu.: 4.0   1st Qu.:3.0   1st Qu.: 3.75  
##  Median : 6.00   Median :36.00   Median : 6.0   Median :3.0   Median : 5.00  
##  Mean   : 5.70   Mean   :37.35   Mean   : 5.7   Mean   :3.4   Mean   : 5.40  
##  3rd Qu.: 6.25   3rd Qu.:50.00   3rd Qu.: 7.0   3rd Qu.:4.0   3rd Qu.: 7.00  
##  Max.   :11.00   Max.   :75.00   Max.   :10.0   Max.   :7.0   Max.   :12.00  
##                                                                              
##     AW.Rate         W.Rate           S               ST              C        
##  Min.   : 7.0   Min.   :10.0   Min.   :237.0   Min.   : 77.0   Min.   :101.0  
##  1st Qu.:27.0   1st Qu.:27.0   1st Qu.:282.8   1st Qu.:105.5   1st Qu.:122.0  
##  Median :41.5   Median :41.0   Median :347.0   Median :116.5   Median :141.0  
##  Mean   :39.3   Mean   :38.2   Mean   :339.6   Mean   :122.0   Mean   :144.6  
##  3rd Qu.:47.0   3rd Qu.:48.0   3rd Qu.:381.0   3rd Qu.:142.5   3rd Qu.:166.2  
##  Max.   :71.0   Max.   :73.0   Max.   :471.0   Max.   :177.0   Max.   :199.0  
##                                                                               
##        F               B               YC              RC      
##  Min.   :268.0   Min.   :30.00   Min.   :30.00   Min.   :0.00  
##  1st Qu.:303.5   1st Qu.:38.75   1st Qu.:38.00   1st Qu.:0.00  
##  Median :312.0   Median :44.00   Median :41.00   Median :1.50  
##  Mean   :318.6   Mean   :43.75   Mean   :42.15   Mean   :1.60  
##  3rd Qu.:334.8   3rd Qu.:48.25   3rd Qu.:47.25   3rd Qu.:2.25  
##  Max.   :381.0   Max.   :58.00   Max.   :55.00   Max.   :5.00  
##                                                                
##      Col                         City  
##  Length:20          London         :6  
##  Class :character   Liverpool      :2  
##  Mode  :character   Manchester     :2  
##                     Birmingham     :1  
##                     Brighton & Hove:1  
##                     Burnley        :1  
##                     (Other)        :7
# Count how many rows (clubs) fall under each value of City.
table(league.table$City)
## 
##      Birmingham Brighton & Hove         Burnley       Leceister           Leeds 
##               1               1               1               1               1 
##       Liverpool          London      Manchester       Newcastle       Sheffield 
##               2               6               2               1               1 
##     Southampton   West Bromwich   Wolverhampton 
##               1               1               1
# Count how many rows there are for each value of Club.
table(league.table$Club)
## 
##          Arsenal      Aston Villa         Brighton          Burnley 
##                1                1                1                1 
##          Chelsea   Crystal Palace          Everton           Fulham 
##                1                1                1                1 
##            Leeds        Leicester        Liverpool         Man City 
##                1                1                1                1 
##       Man United        Newcastle Sheffield United      Southampton 
##                1                1                1                1 
##        Tottenham        West Brom         West Ham           Wolves 
##                1                1                1                1
# Cross-tabulate Club (rows) against City (columns).
table(league.table$Club, league.table$City)
##                   
##                    Birmingham Brighton & Hove Burnley Leceister Leeds Liverpool
##   Arsenal                   0               0       0         0     0         0
##   Aston Villa               1               0       0         0     0         0
##   Brighton                  0               1       0         0     0         0
##   Burnley                   0               0       1         0     0         0
##   Chelsea                   0               0       0         0     0         0
##   Crystal Palace            0               0       0         0     0         0
##   Everton                   0               0       0         0     0         1
##   Fulham                    0               0       0         0     0         0
##   Leeds                     0               0       0         0     1         0
##   Leicester                 0               0       0         1     0         0
##   Liverpool                 0               0       0         0     0         1
##   Man City                  0               0       0         0     0         0
##   Man United                0               0       0         0     0         0
##   Newcastle                 0               0       0         0     0         0
##   Sheffield United          0               0       0         0     0         0
##   Southampton               0               0       0         0     0         0
##   Tottenham                 0               0       0         0     0         0
##   West Brom                 0               0       0         0     0         0
##   West Ham                  0               0       0         0     0         0
##   Wolves                    0               0       0         0     0         0
##                   
##                    London Manchester Newcastle Sheffield Southampton
##   Arsenal               1          0         0         0           0
##   Aston Villa           0          0         0         0           0
##   Brighton              0          0         0         0           0
##   Burnley               0          0         0         0           0
##   Chelsea               1          0         0         0           0
##   Crystal Palace        1          0         0         0           0
##   Everton               0          0         0         0           0
##   Fulham                1          0         0         0           0
##   Leeds                 0          0         0         0           0
##   Leicester             0          0         0         0           0
##   Liverpool             0          0         0         0           0
##   Man City              0          1         0         0           0
##   Man United            0          1         0         0           0
##   Newcastle             0          0         1         0           0
##   Sheffield United      0          0         0         1           0
##   Southampton           0          0         0         0           1
##   Tottenham             1          0         0         0           0
##   West Brom             0          0         0         0           0
##   West Ham              1          0         0         0           0
##   Wolves                0          0         0         0           0
##                   
##                    West Bromwich Wolverhampton
##   Arsenal                      0             0
##   Aston Villa                  0             0
##   Brighton                     0             0
##   Burnley                      0             0
##   Chelsea                      0             0
##   Crystal Palace               0             0
##   Everton                      0             0
##   Fulham                       0             0
##   Leeds                        0             0
##   Leicester                    0             0
##   Liverpool                    0             0
##   Man City                     0             0
##   Man United                   0             0
##   Newcastle                    0             0
##   Sheffield United             0             0
##   Southampton                  0             0
##   Tottenham                    0             0
##   West Brom                    1             0
##   West Ham                     0             0
##   Wolves                       0             1

 

Chi Squared Test on Club and City
Null Hypothesis: The variables Club and City are independent.

# summary() of a contingency table reports a chi-squared test for independence
# of its factors; here the table is Club by City.
summary(with(league.table, table(Club, City)))
## Number of cases in table: 20 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 240, df = 228, p-value = 0.2798
##  Chi-squared approximation may be incorrect
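p-value > 0.05 : we fail to reject the null hypothesis of independence. Note that each club appears exactly once, so every expected cell count is far below 5 and the chi-squared approximation is unreliable (as R warns above).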

 

5th percentile of the number of goals scored

# 5% quantile of goals scored (GF).
quantile(league.table$GF, probs = 0.05)
##   5% 
## 19.8

 

5th and 95th percentiles of the number of goals scored

# 5% and 95% quantiles of goals scored (GF).
quantile(league.table$GF, probs = c(0.05, 0.95))
##   5%  95% 
## 19.8 56.4

 

Quartiles of the number of goals scored (quantiles at 25% probability intervals)

# Quantiles of goals scored (GF) at 0%, 25%, 50%, 75% and 100%; these are
# quantile()'s default probabilities, spelled out explicitly.
quantile(league.table$GF, probs = seq(0, 1, by = 0.25))
##    0%   25%   50%   75%  100% 
## 16.00 28.00 39.50 45.75 64.00

 

Using the t.test and asking if the mean of points can be 40

# One-sample t-test of the Pts column against a hypothesised mean of 40.
t.test(x = league.table$Pts, mu = 40)
## 
##  One Sample t-test
## 
## data:  league.table$Pts
## t = 0.032737, df = 19, p-value = 0.9742
## alternative hypothesis: true mean is not equal to 40
## 95 percent confidence interval:
##  33.70649 46.49351
## sample estimates:
## mean of x 
##      40.1

p-value = 0.9742 > 0.05, so there is no evidence that the mean differs from 40.
The null hypothesis is not rejected.

 

Using the t.test and asking if the mean of points can be 40 with a confidence level of 99%

# Same one-sample t-test against a mean of 40, with a 99% confidence interval.
t.test(x = league.table$Pts, mu = 40, conf.level = 0.99)
## 
##  One Sample t-test
## 
## data:  league.table$Pts
## t = 0.032737, df = 19, p-value = 0.9742
## alternative hypothesis: true mean is not equal to 40
## 99 percent confidence interval:
##  31.36077 48.83923
## sample estimates:
## mean of x 
##      40.1

p-value = 0.9742 > 0.01, so there is no evidence that the mean differs from 40.
The null hypothesis is not rejected.

 

Calculating the confidence interval for the (pseudo)median of points using the Wilcoxon signed-rank test

# Wilcoxon test on the Pts column; conf.int = TRUE requests the confidence
# interval that is printed with the result.
wilcox.test(x = league.table$Pts, conf.int = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  league.table$Pts
## V = 210, p-value = 9.542e-05
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
##  33.49996 46.00002
## sample estimates:
## (pseudo)median 
##       40.02641

 

Using the Shapiro Test for normality

# Shapiro-Wilk normality test on the Pts column.
shapiro.test(x = league.table$Pts)
## 
##  Shapiro-Wilk normality test
## 
## data:  league.table$Pts
## W = 0.98854, p-value = 0.9957

p-value > 0.05 so there is no evidence against the points being normally distributed.

 

Checking if the correlation between points and the number of wins is significant

# Correlation test between points (Pts) and wins (W).
cor.test(x = league.table$Pts, y = league.table$W)
## 
##  Pearson's product-moment correlation
## 
## data:  league.table$Pts and league.table$W
## t = 24.449, df = 18, p-value = 2.931e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9623379 0.9942835
## sample estimates:
##       cor 
## 0.9852749

p-value < 0.05 so a significant correlation exists.

 

Checking if the correlation between points and the number of yellow cards is significant

# Correlation test between points (Pts) and yellow cards (YC).
cor.test(x = league.table$Pts, y = league.table$YC)
## 
##  Pearson's product-moment correlation
## 
## data:  league.table$Pts and league.table$YC
## t = -2.015, df = 18, p-value = 0.05908
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.73247947  0.01666435
## sample estimates:
##        cor 
## -0.4290202

p-value > 0.05 so a significant correlation does not exist.

 

To Links

Visualizing the Data


Loading the necessary packages

library(ggplot2)

#install.packages("devtools")
#devtools::install_github("slowkow/ggrepel")

library(ggrepel)

 

Who has how many points?


# Horizontal bar chart of points per club, each bar filled with the club's
# colour. Col is mapped inside aes() and drawn as-is via scale_fill_identity(),
# so colours stay tied to their rows instead of relying on vector position.
ggplot(league.table, aes(x = Club, y = Pts, fill = Col)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  theme_bw() +
  coord_flip() +
  labs(
    x = "Club",
    y = "Points",
    title = "Premier League 2020-21 Points"
  )

To Links

 

Goals Scored


# Horizontal bar chart of goals scored (GF) per club, each bar filled with the
# club's colour. Col is mapped inside aes() and drawn as-is via
# scale_fill_identity(), so colours stay tied to their rows.
ggplot(league.table, aes(x = Club, y = GF, fill = Col)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  theme_bw() +
  coord_flip() +
  labs(
    x = "Club",
    y = "Goals",
    title = "Premier League 2020-21 Goals Scored"
  )

To Links

 

Goals Conceded


# Horizontal bar chart of goals conceded (GA) per club, each bar filled with
# the club's colour. Col is mapped inside aes() and drawn as-is via
# scale_fill_identity(), so colours stay tied to their rows.
ggplot(league.table, aes(x = Club, y = GA, fill = Col)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  theme_bw() +
  coord_flip() +
  labs(
    x = "Club",
    y = "Goals",
    title = "Premier League 2020-21 Goals Conceded"
  )

To Links

 

Points Vs Goals Scored


# Scatter plot of points against goals scored with a linear trend line and
# repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() (not the global aes, so the trend line is still fitted to all clubs
# at once). The label size is a fixed parameter rather than a constant mapped
# through an identity scale, and se uses FALSE instead of the reassignable F.
ggplot(league.table, aes(x = GF, y = Pts)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Goals",
    y = "Points",
    title = "Points Vs Goals Scored"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 2.5,
    nudge_x = 0, nudge_y = -1.0
  )

To Links

 

Points Vs Goals Conceded


# Scatter plot of points against goals conceded with a linear trend line and
# repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() (not the global aes, so the trend line is still fitted to all clubs
# at once). The label size is a fixed parameter rather than a constant mapped
# through an identity scale, and se uses FALSE instead of the reassignable F.
ggplot(league.table, aes(x = GA, y = Pts)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Goals",
    y = "Points",
    title = "Points Vs Goals Conceded"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -1.0
  )

To Links

 

Home Performance


# Scatter plot of points against home win rate (HW.Rate) with a linear trend
# line and repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() rather than passed as row-aligned vectors; the label size is a fixed
# parameter instead of a constant mapped through an identity scale.
ggplot(league.table, aes(x = HW.Rate, y = Pts)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Win %",
    y = "Points",
    title = "Home Performance"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -0.25
  )

To Links

 

Away Performance


# Scatter plot of points against away win rate (AW.Rate) with a linear trend
# line and repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() rather than passed as row-aligned vectors; the label size is a fixed
# parameter instead of a constant mapped through an identity scale.
ggplot(league.table, aes(x = AW.Rate, y = Pts)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Win %",
    y = "Points",
    title = "Away Performance"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -0.25
  )

To Links

 

Creativity


# Scatter plot of shots on target (ST) against shots (S) with a linear trend
# line and repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() rather than passed as row-aligned vectors; the label size is a fixed
# parameter instead of a constant mapped through an identity scale.
ggplot(league.table, aes(x = S, y = ST)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Shots",
    y = "Shots on Target",
    title = "Creativity"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -0.25
  )

To Links

 

Clinical Attacks


# Scatter plot of score rate (goals per shot, as a percentage) against shots,
# with a linear trend line and repelled club labels.
# Colour and label are mapped from league.table columns inside each layer's
# aes() rather than passed as row-aligned vectors; the label size is a fixed
# parameter instead of a constant mapped through an identity scale.
ggplot(league.table, aes(x = S, y = GF / S * 100)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Shots",
    y = "Score Rate",
    title = "Clinical Attacks"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -0.25
  )

To Links

 

Fair Play / Tactical Fouls


# Scatter plot of bookings (B) against fouls (F) with a linear trend line and
# repelled club labels.
# Inside aes(), F refers to the league.table column of that name, not to the
# FALSE shorthand. Colour and label are mapped from league.table columns inside
# each layer's aes() rather than passed as row-aligned vectors; the label size
# is a fixed parameter instead of a constant mapped through an identity scale.
ggplot(league.table, aes(x = F, y = B)) +
  geom_point(aes(colour = Col), size = 4) +
  scale_colour_identity() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  theme_bw() +
  labs(
    x = "Fouls",
    y = "Bookings",
    title = "Fair Play/ Tactical Fouls"
  ) +
  geom_text_repel(
    aes(label = Club),
    size = 3,
    nudge_x = 0, nudge_y = -0.25
  )

To Links

 

Yellow Cards


# Horizontal bar chart of yellow cards (YC) per club, each bar filled with the
# club's colour. Col is mapped inside aes() and drawn as-is via
# scale_fill_identity(), so colours stay tied to their rows.
ggplot(league.table, aes(x = Club, y = YC, fill = Col)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  theme_bw() +
  coord_flip() +
  labs(
    x = "Club",
    y = "Yellow Cards",
    title = "Premier League 2020-21 Yellow Cards"
  )

To Links

 

Red Cards


# Horizontal bar chart of red cards (RC) per club, each bar filled with the
# club's colour. Col is mapped inside aes() and drawn as-is via
# scale_fill_identity(), so colours stay tied to their rows.
ggplot(league.table, aes(x = Club, y = RC, fill = Col)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  theme_bw() +
  coord_flip() +
  labs(
    x = "Club",
    y = "Red Cards",
    title = "Premier League 2020-21 Red Cards"
  )

To Links

By Rohan Pradhan (2018CSC1047)

To Top