For this page we will use the results of the 2021–22 English Premier League football season, available from football-data.co.uk and downloadable as a data frame directly from the internet via the following:
import pandas as pd
# Display options: show every row, but cap the number of columns and the
# overall width so that printed data frames fit on the page
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_colwidth', 40)
pd.set_option('display.width', 117)
# Download the data
df = pd.read_csv('https://www.football-data.co.uk/mmz4281/2122/E0.csv')
# Trim to the columns used on this page. Take an explicit copy: assigning to a
# column of a bare selection such as df[cols] can raise a SettingWithCopyWarning
cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
df = df[cols].copy()
# Re-format the dates from day/month/year strings into datetimes
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
print(df.head())
## Date HomeTeam AwayTeam FTHG FTAG FTR
## 0 2021-08-13 Brentford Arsenal 2 0 H
## 1 2021-08-14 Man United Leeds 5 1 H
## 2 2021-08-14 Burnley Brighton 1 2 A
## 3 2021-08-14 Chelsea Crystal Palace 3 0 H
## 4 2021-08-14 Everton Southampton 3 1 H
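We’re going to use the following columns:
- Date: the date of the match
- HomeTeam and AwayTeam: the two teams involved
- FTHG and FTAG: the full-time home goals and full-time away goals
- FTR: the full-time result (H for a home-team win, A for an away-team win, D for a draw)

We need to have a data frame with data in the correct format in order to be able to plot it. So let’s go ahead and calculate the running total of goals scored for each team over the season as that is something we can represent in a line plot: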
# Data preparation: build one data frame per team holding its running goal tally
season = {}
for team in df['HomeTeam'].unique():
    # A team's own goals are FTHG in its home matches and FTAG in its away matches
    at_home = df.loc[df['HomeTeam'] == team].assign(goals_scored=lambda d: d['FTHG'])
    on_the_road = df.loc[df['AwayTeam'] == team].assign(goals_scored=lambda d: d['FTAG'])
    # Put the matches in date order and renumber them from 0
    fixtures = pd.concat([at_home, on_the_road])
    fixtures = fixtures.sort_values('Date').reset_index(drop=True)
    # Running total of goals scored so far
    fixtures['total'] = fixtures['goals_scored'].cumsum()
    season[team] = fixtures
# Check the data for Manchester United
print(season['Man United'].head())
## Date HomeTeam AwayTeam FTHG FTAG FTR goals_scored total
## 0 2021-08-14 Man United Leeds 5 1 H 5 5
## 1 2021-08-22 Southampton Man United 1 1 D 1 6
## 2 2021-08-29 Wolves Man United 0 1 A 1 7
## 3 2021-09-11 Man United Newcastle 4 1 H 4 11
## 4 2021-09-19 West Ham Man United 1 2 A 2 13
Not a lot. Here’s how to turn the above data into a line plot without using Pandas (only Matplotlib):
import matplotlib.pyplot as plt
# Create axes
ax = plt.axes()
# Extract the data
man_u = season['Man United']
# Plot the index (the week number) against the total number of Man United goals.
# The index counts the team's matches from 0 because it was reset after the
# matches were sorted by date
ax.plot(man_u.index, man_u['total'], c='#e80909')
# Re-format the plot
ax.set_title("Manchester United's 2021-22 Premier League Goals")
ax.set_ylabel('Goals Scored')
ax.set_ylim(0, 60)
ax.set_xlabel('Week')
# The x-limits span index values 0 to 37
ax.set_xlim(0, 37)
plt.show()
…and here’s how to do it using Pandas:
# Create axes
ax = plt.axes()
# Extract the data
man_utd = season['Man United']
# Plot the index (the week number) against the total number of Man United goals.
# No x column is given, so the data frame's index is used for the x-axis;
# passing ax=ax draws onto the axes created above
man_utd.plot(y='total', c='#e80909', ax=ax)
# Re-format the plot
ax.set_title("Manchester United's 2021-22 Premier League Goals")
ax.set_ylabel('Goals Scored')
ax.set_ylim(0, 60)
ax.set_xlabel('Week')
# The x-limits span index values 0 to 37
ax.set_xlim(0, 37)
# Take the legend off again so that the plot has none
ax.legend().remove()
plt.show()
Notice the difference? There isn’t any difference in what the plots look like but the first was made using this line:
ax.plot(man_u.index, man_u['total'], c='#e80909')
while the second used this line:
man_utd.plot(y='total', c='#e80909', ax=ax)
Everything else was the same, and the same plot was produced because Pandas uses Matplotlib to create its plots. It’s simply a matter of preference as to which method you use.
Anyway, here’s a plot of all the teams’ goals, using a dictionary of the teams’ colours for the formatting:
# Map each team's name (as spelled in the data) to a hex colour for plotting
team_colours = {
    'Arsenal': '#ef0107',
    'Aston Villa': '#7b003a',
    'Brentford': '#e30613',
    'Brighton': '#005daa',
    'Burnley': '#80bfff',
    'Chelsea': '#034694',
    'Crystal Palace': '#c4122e',
    'Everton': '#274488',
    'Leeds': '#ffff00',
    'Leicester': '#0101e8',
    'Liverpool': '#dd0000',
    'Man City': '#6caddf',
    'Man United': '#e80909',
    'Newcastle': '#000000',
    'Norwich': '#00a650',
    'Southampton': '#ed1a3b',
    'Tottenham': '#132257',
    'Watford': '#fbee23',
    'West Ham': '#7f0000',
    'Wolves': '#fdbc02',
}
# Plot every team's running total on one set of axes, in alphabetical order
ax = plt.axes()
team_names = sorted(df['HomeTeam'].unique())
for name in team_names:
    season[name].plot(y='total', ax=ax, label=name, color=team_colours[name])
# Formatting
ax.set_title('2021-22 English Premier League Season')
ax.set_xlabel('Week')
ax.set_ylabel('Goals Scored')
ax.set_xlim(0, 37)
ax.set_ylim(0, 100)
# Shrink the plotting area to leave room on the right for the 20-entry legend
plt.subplots_adjust(right=0.75)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='x-small')
plt.show()
Re-format the data to make it into something that can be plotted in a scatter plot:
# Data preparation: calculate the total number of home and away goals for each team
# Each pivot table has one row per team and one column per opponent; summing
# along a row then gives that team's total.
# NOTE(review): pivot_table aggregates with the mean by default, so this
# presumably relies on each pairing meeting only once at each ground - confirm
goals_at_home = pd.pivot_table(df, values='FTHG', index='HomeTeam', columns='AwayTeam')
goals_on_road = pd.pivot_table(df, values='FTAG', index='AwayTeam', columns='HomeTeam')
home_goals = goals_at_home.sum(axis=1)
away_goals = goals_on_road.sum(axis=1)
# Place the two totals side by side and give the columns readable names
goals = pd.concat([home_goals, away_goals], axis=1)
goals.columns = ['Home Goals', 'Away Goals']
print(goals.head())
## Home Goals Away Goals
## Arsenal 35.0 26.0
## Aston Villa 29.0 23.0
## Brentford 22.0 26.0
## Brighton 19.0 23.0
## Burnley 18.0 16.0
We will use a dictionary of the teams’ short-hand codes for the annotations:
# Map each team's name (as spelled in the data) to a three-letter code for annotations
team_codes = {
    'Arsenal': 'ARS',
    'Aston Villa': 'AVL',
    'Brentford': 'BRE',
    'Brighton': 'BRI',
    'Burnley': 'BUR',
    'Chelsea': 'CHE',
    'Crystal Palace': 'CRY',
    'Everton': 'EVE',
    'Leeds': 'LEE',
    'Leicester': 'LEI',
    'Liverpool': 'LIV',
    'Man City': 'MCI',
    'Man United': 'MUN',
    'Newcastle': 'NEW',
    'Norwich': 'NOR',
    'Southampton': 'SOT',
    'Tottenham': 'TOT',
    'Watford': 'WAT',
    'West Ham': 'WHU',
    'Wolves': 'WOL',
}
# Plot home goals against away goals, one marker per team
ax = plt.axes()
goals.plot.scatter('Home Goals', 'Away Goals', ax=ax, alpha=0.5)
# Dashed diagonal: teams below it scored more goals at home than away
axis_max = 63
ax.plot([0, axis_max], [0, axis_max], 'k--', alpha=0.5)
# Formatting
ax.set_title('2021-22 English Premier League Season')
ax.set_ylim(0, axis_max)
ax.set_xlim(0, axis_max)
# Label each marker with its team's short-hand code
for team, tally in goals.iterrows():
    ax.annotate(team_codes[team], (tally['Home Goals'], tally['Away Goals']), fontsize=6)
# Make the axes square: set the aspect to the ratio of the x-range to the y-range
y_low, y_high = ax.get_ylim()
x_low, x_high = ax.get_xlim()
x_span = abs(x_high - x_low)
y_span = abs(y_high - y_low)
ax.set_aspect(x_span / y_span)
plt.show()
Similarly, re-format the data now to make it into something that can be plotted in a box plot:
# Data preparation: calculate total number of home and away wins for each team
# Count the matches for every (team, full-time result) pair; len counts the rows in each group
home_results = pd.pivot_table(df, values='AwayTeam', index='HomeTeam', columns='FTR', aggfunc=len)
away_results = pd.pivot_table(df, values='HomeTeam', index='AwayTeam', columns='FTR', aggfunc=len)
# A home win is an 'H' result for the home side; an away win is an 'A' result for the away side
overall = pd.concat([home_results['H'], away_results['A']], axis=1)
overall = overall.rename(columns={'H': 'Home Wins', 'A': 'Away Wins'})
# Look each team's colour up by name so that the assignment does not depend on
# the dictionary happening to be in the same order as the data frame's index
# (a team missing from the dictionary raises a KeyError instead of being mis-coloured)
overall['team_colours'] = [team_colours[team] for team in overall.index]
print(overall.head())
## Home Wins Away Wins team_colours
## Arsenal 13.0 9 #ef0107
## Aston Villa 6.0 7 #7b003a
## Brentford 7.0 6 #e30613
## Brighton 5.0 7 #005daa
## Burnley 5.0 2 #80bfff
import numpy as np
# Plot one box for home wins and one for away wins
ax = plt.axes()
win_columns = ['Home Wins', 'Away Wins']
box_parts = overall.boxplot(column=win_columns, grid=False, return_type='dict', patch_artist=True)
# Formatting
ax.set_title('2021-22 English Premier League Season')
# Overlay every team on each box. The points are centred on x = 1 and x = 2,
# with a little random horizontal jitter so that markers and labels overlap less
for position, column in enumerate(win_columns, start=1):
    jitter = np.random.normal(loc=position, scale=0.08, size=len(overall))
    ax.scatter(jitter, overall[column], zorder=10, c=overall['team_colours'], alpha=0.5)
    for x_pos, (team, wins) in zip(jitter, overall[column].items()):
        ax.annotate(team_codes[team], (x_pos, wins), fontsize=6, rotation=45)
# Black outlines throughout; colour the lines first, then fill the boxes grey
for part in ('boxes', 'medians', 'whiskers'):
    plt.setp(box_parts[part], color='k')
plt.setp(box_parts['boxes'], facecolor='#cccccc')
ax.set_ylim(0, 16)
plt.show()
# Data preparation: add each match's combined goal count to a copy of the results
# (assign returns a new data frame, leaving df itself untouched)
overall = df.assign(total_goals=df['FTHG'] + df['FTAG'])
print(overall.head())
## Date HomeTeam AwayTeam FTHG FTAG FTR total_goals
## 0 2021-08-13 Brentford Arsenal 2 0 H 2
## 1 2021-08-14 Man United Leeds 5 1 H 6
## 2 2021-08-14 Burnley Brighton 1 2 A 3
## 3 2021-08-14 Chelsea Crystal Palace 3 0 H 3
## 4 2021-08-14 Everton Southampton 3 1 H 4
# Plot the distribution of goals per match
ax = plt.axes()
# Half-integer bin edges (-0.5, 0.5, ..., 10.5) centre each bar on a whole number of goals
bin_edges = [n - 0.5 for n in range(12)]
overall.hist(
    'total_goals',
    ax=ax,
    grid=False,
    bins=bin_edges,
    rwidth=0.9,
    facecolor='#cccccc',
    edgecolor='k',
)
# Formatting
ax.set_title('2021-22 English Premier League Season')
ax.set_xlabel('Number of Goals in the Match')
ax.set_ylabel('Number of Matches')
ax.set_xlim(-0.5, 9.5)
ax.set_ylim(0, 90)
# One tick per whole number of goals within the visible range
plt.xticks(range(10))
plt.show()
# Data preparation: get the overall results for each team
# Count each team's home results and relabel them from its own point of view
# (a home win 'H' is a win, an away win 'A' is a loss)
home_results = pd.pivot_table(df, values='AwayTeam', index='HomeTeam', columns='FTR', aggfunc=len)
home_results = home_results.rename(columns={'H': 'W', 'A': 'L'})
# Likewise for away results (an away win 'A' is a win, a home win 'H' is a loss)
away_results = pd.pivot_table(df, values='HomeTeam', index='AwayTeam', columns='FTR', aggfunc=len)
away_results = away_results.rename(columns={'A': 'W', 'H': 'L'})
# A result type that a team never recorded at home (or away) is absent from its
# pivot table, i.e. NaN, and NaN + n is NaN. Treat a missing count as 0 so that
# the team's overall total is still correct
results = home_results.add(away_results, fill_value=0).fillna(0)
# Transpose: one column per team, one row per result type (D, L, W)
results = results.T
print(results.head())
## HomeTeam Arsenal Aston Villa Brentford Brighton ... Tottenham Watford West Ham Wolves
## FTR ...
## D 3.0 6.0 7.0 15.0 ... 5.0 5.0 8.0 6.0
## L 13.0 19.0 18.0 11.0 ... 11.0 27.0 14.0 17.0
## W 22.0 13.0 13.0 12.0 ... 22.0 6.0 16.0 15.0
##
## [3 rows x 20 columns]
# Plot Arsenal's results as a pie chart
# The rows of results are D, L, W, so the labels and colours are listed in that order
wedge_style = {'Drew': '#9c824a', 'Lost': '#023474', 'Won': '#db0007'}
labels = list(wedge_style)
colours = list(wedge_style.values())
ax = results.plot.pie(y='Arsenal', colors=colours, labels=labels)
# Formatting
ax.set_title('2021-22 English Premier League Season:\nArsenal')
ax.set_ylabel('')
ax.get_legend().remove()
plt.show()
Plotting all the subplots using one function call creates the problem that the colours can’t be changed after the plot has been generated:
teams = ['Arsenal', 'Man United']
labels = ['Drew', 'Lost', 'Won']
# A single call draws one pie per selected column, arranged in a 1 x 2 grid
axes = results[teams].plot.pie(subplots=True, layout=(1, 2), labels=labels)
# Formatting: the grid comes back two-dimensional, so take its only row
for team, pie_ax in zip(teams, axes[0]):
    pie_ax.set_title(team)
    pie_ax.set_ylabel('')
    pie_ax.get_legend().remove()
plt.show()
Instead, use a loop to plot each sub-plot individually, as this allows the format of each plot to be controlled separately:
# One list of colours per team, in the same D, L, W order as the labels
colours = [
    ['#9c824a', '#023474', '#db0007'],
    ['#000000', '#ffe500', '#da020e'],
]
fig, axes = plt.subplots(nrows=1, ncols=2)
# Draw each team's pie on its own axes so that it can have its own colours
for pie_ax, team, palette in zip(axes, ['Arsenal', 'Man United'], colours):
    # Plot
    results[team].plot.pie(ax=pie_ax, labels=labels, colors=palette)
    # Formatting
    pie_ax.set_title(team)
    pie_ax.set_ylabel('')
plt.show()