For this page we will use data from the finals of various Olympic events, available on Wikipedia.
Boxplots using Matplotlib in Python can take data in the form of a list-of-lists:
# Men's long jump finals: one list of results per Olympic Games
athens_2004 = [8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21]
beijing_2008 = [8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00]
london_2012 = [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93]
rio_2016 = [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05]
# Collect the results into a list-of-lists, with one label per dataset
data = [athens_2004, beijing_2008, london_2012, rio_2016]
labels = ['Athens 2004', 'Beijing 2008', 'London 2012', 'Rio 2016']
But it is often easier to use a Pandas data frame that you first convert into a Numpy array:
import pandas as pd
import numpy as np
# Men's long jump finals: one column of results per Olympic Games
results = {
    'Athens 2004': [8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21],
    'Beijing 2008': [8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00],
    'London 2012': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Rio 2016': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
}
frame = pd.DataFrame(results)
# The column headings of the data frame double as the labels
labels = frame.columns.tolist()
# Convert the data frame into a Numpy array so it can be plotted this way
data = np.array(frame)
Both of these methods will work with the code that follows. However, the advantage of using a data frame is that it is a more useful format for doing data manipulation in general.
The plotting is going to be done in two lines: one to create the plot and the axes:
ax = plt.axes()
and one to create the boxplot:
bp = ax.boxplot(data, labels=labels)
import matplotlib.pyplot as plt
# Create the axes, then draw one box per dataset on them
ax = plt.axes()
bp = ax.boxplot(data, labels=labels)
# Title and axis labels, set in a single call
ax.set(
    title="Men's Long Jump Finals",
    ylabel='Distance [m]',
    xlabel='Olympics',
)
Note that this method is robust to the number of datasets you have. We could easily comment out some of the sets of results (or add more) and still have it work.
We’re going to make a number of formatting changes to improve how this graph looks:
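- The axes will be repositioned by replacing `plt.axes()` with `plt.axes([left, bottom, width, height])`, where the default position is `[0.125, 0.11, 0.775, 0.77]`
- The outlier markers will be hidden (`sym=''`)
- The whiskers will be extended to cover the full range of the data (`whis=[0, 100]`; older versions of Matplotlib also accepted `whis='range'`)
- The width of the boxes will be set (`widths=0.4`)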
import numpy as np

# Settings
x = 6  # Want figures to be A6
# 46.82 x 33.11 inches is A0; each A-size step scales both sides by sqrt(0.5)
plt.rc('figure', figsize=[46.82 * .5**(.5 * x), 33.11 * .5**(.5 * x)])
# Render the text with LaTeX (requires a working LaTeX installation)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Plot
# plt.axes([left, bottom, width, height])
# default = [0.125, 0.11, 0.775, 0.77]
ax = plt.axes([0.1, 0.08, 0.79, 0.86])
# Hide the outlier markers, run the whiskers from the 0th to the 100th
# percentile and set the width of the boxes
bp = ax.boxplot(data, sym='', whis=[0, 100], widths=0.4, labels=labels)

# Axis details
ax.set_title("Men's Long Jump Finals")
ax.set_ylabel('Distance [m]')
ax.set_xlabel('Olympics')
ylim = ax.set_ylim(7.9, 8.6)
# Remove the tick marks under the boxes and instead place minor ticks
# half-way between them (the boxes sit at x = 1, 2, ..., len(labels))
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)

plt.show()
It might be cool to colour each boxplot in the colours of the host country. Let’s see how to do that:
- In the call to `ax.boxplot()` you need to add `patch_artist=True`
- The colours of the individual elements can then be changed with `plt.setp()`:
    - `plt.setp(bp['boxes'], color='red')` will set the box outlines to red
    - `plt.setp(bp['boxes'], facecolor='red')` will set the box fills to red
    - `plt.setp(bp['medians'], color='red')` will set the median lines to red
    - `plt.setp(bp['whiskers'], color='red')` will set the vertical whiskers to red
    - `plt.setp(bp['caps'], color='red')` will set the horizontal caps to red
# Plot
ax = plt.axes()
# patch_artist=True draws the boxes as patches, which is what allows them to
# be filled with a colour further down
bp = ax.boxplot(
    data, sym='', whis=[0, 100], widths=0.4, labels=labels,
    patch_artist=True
)

# Axis details
ax.set_title("Men's Long Jump Finals")
ax.set_ylabel('Distance [m]')
ax.set_xlabel('Olympics')
# Remove the tick marks under the boxes and instead place minor ticks
# half-way between them
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)

# Colour the outlines
colours = ['blue', 'yellow', 'red', 'yellow']
for i, box in enumerate(bp['boxes']):
    # Cycle through the colours if there are more boxes than colours
    colour = colours[i % len(colours)]
    # Set the colour of the box's outline
    # NOTE: on a patch, `color` sets the fill as well as the outline; the
    # fill is overwritten in the next loop
    plt.setp(box, color=colour)
    # Set the colour of the median line
    plt.setp(bp['medians'][i], color=colour)
    # Each box has two whiskers and two caps (lower, then upper)
    plt.setp(bp['whiskers'][2 * i:2 * i + 2], color=colour)
    plt.setp(bp['caps'][2 * i:2 * i + 2], color=colour)

# Fill the boxes with colours (requires patch_artist=True)
colours = ['white', 'red', 'blue', 'green']
for i, box in enumerate(bp['boxes']):
    # Cycle through the colours if there are more boxes than colours
    plt.setp(box, facecolor=colours[i % len(colours)])

plt.show()
Unfortunately, that doesn’t look so good, so we’ll remove it going forward and just keep the fill colour and the median line colour.
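It’s pretty useful to be able to see the median values of each dataset right there on the graph, and this feature can be added by using the `ax.text()` function:
- `bp['medians'][i].get_ydata()[0]` gets the median of the i-th dataset from the boxplot
- `str(round(s, 2))` rounds a median `s` to 2 decimal places and converts it to a string
- `np.arange(len(labels)) + 1` gives the x-positions of the boxes
- `ax.text(x_pos, y_pos, 'text')` adds the text to the plot at the given position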
#
# Plot
#
ax = plt.axes()
bp = ax.boxplot(
    data, sym='', whis=[0, 100], widths=0.4, labels=labels,
    patch_artist=True
)

# Axis details
ax.set_title("Men's Long Jump Finals")
ax.set_ylabel('Distance [m]')
ax.set_xlabel('Olympics')
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)
# Set the fill colour
plt.setp(bp['boxes'], facecolor='C0')
# Set the colour of the median lines
plt.setp(bp['medians'], color='black')

#
# Annotate the medians
#
# Get the samples' medians from the median lines; always show 2 decimal
# places so that every label has the same precision (eg '8.10', not '8.1')
medians = [f'{line.get_ydata()[0]:.2f}' for line in bp['medians']]
# Get the x-positions for the labels
x_pos = np.arange(len(labels)) + 1
# Get the y-axis limits and range (before they are changed below)
y_lim = ax.get_ylim()
y_range = y_lim[1] - y_lim[0]
# Extend the bottom of the y-axis by 5% of its range to fit the labels
ax.set_ylim([y_lim[0] - 0.05 * y_range, y_lim[1]])
# Set the y-position for the labels: half-way into the space just added
y_pos = y_lim[0] - 0.025 * y_range
# Create the annotations
for pos, median in zip(x_pos, medians):
    ax.text(
        pos, y_pos, rf'$\tilde{{x}} = {median}$ m',
        horizontalalignment='center', size='small'
    )
As you start adding more annotations to the plot area you quickly begin to run out of space. As a result, it sometimes makes more sense to add these labels above the boxes instead of below them:
#
# Plot
#
ax = plt.axes()
bp = ax.boxplot(
    data, sym='', whis=[0, 100], widths=0.4, labels=labels,
    patch_artist=True
)

# Axis details
ax.set_title("Men's Long Jump Finals")
ax.set_ylabel('Distance [m]')
ax.set_xlabel('Olympics')
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)
# Set the fill colour
plt.setp(bp['boxes'], facecolor='C0')
# Set the colour of the median lines
plt.setp(bp['medians'], color='black')

# Get the samples' medians from the median lines; always show 2 decimal
# places so that every label has the same precision (eg '8.10', not '8.1')
medians = [f'{line.get_ydata()[0]:.2f}' for line in bp['medians']]
# Get the samples' sizes: an array holds one dataset per column whereas a
# list-of-lists holds one dataset per inner list
datasets = data.T if isinstance(data, np.ndarray) else data
sample_sizes = [f'{len(dataset):,}' for dataset in datasets]
# Get the x-positions for the labels
x_pos = np.arange(len(labels)) + 1
# Get the y-axis limits and range (before they are changed below)
y_lim = ax.get_ylim()
y_range = y_lim[1] - y_lim[0]
# Extend the top of the y-axis by 10% of its range to fit the labels
ax.set_ylim([y_lim[0], y_lim[1] + 0.10 * y_range])

# Set the y-position for the median labels
y_pos = y_lim[1] + 0.04 * y_range
# Create the annotations for the medians
for pos, median in zip(x_pos, medians):
    ax.text(
        pos, y_pos, rf'$\tilde{{x}} = {median}$ m',
        horizontalalignment='center', size='small'
    )

# Set the y-position for the sample size labels (the old top of the axis)
y_pos = y_lim[1]
# Create the annotations for the sample sizes
for pos, sample_size in zip(x_pos, sample_sizes):
    ax.text(
        pos, y_pos, rf'$n = {sample_size}$',
        horizontalalignment='center', size='small'
    )
A background can help make the plot look more professional and gridlines can aid in interpretation.
- The background colour is set with `ax.set_facecolor()`. A specific colour can be set (eg ‘red’, ‘blue’, etc) while a number between 0 and 1 in string format (eg ‘0.9’) will invoke a shade of grey (‘0’ being black, ‘1’ being white)
- Gridlines are added with `plt.grid()`
- Individual horizontal lines are added with `ax.axhline()`
#
# Plot
#
ax = plt.axes()
bp = ax.boxplot(
    data, sym='', whis=[0, 100], widths=0.4, labels=labels, patch_artist=True
)

# Axis details
ax.set_title("Men's Long Jump Finals")
ax.set_ylabel('Distance [m]')
ax.set_xlabel('Olympics')
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)
# Set the fill colour
plt.setp(bp['boxes'], facecolor='C0')
# Set the colour of the median lines
plt.setp(bp['medians'], color='black')

# Background: light grey with white gridlines
ax.set_facecolor('0.9')
plt.grid(color='white')
# Pick out the 8 m and 8.5 m levels with semi-transparent black lines
ax.axhline(8, color='black', alpha=0.4)
ax.axhline(8.5, color='black', alpha=0.4)

# Get the samples' medians from the median lines; always show 2 decimal
# places so that every label has the same precision (eg '8.10', not '8.1')
medians = [f'{line.get_ydata()[0]:.2f}' for line in bp['medians']]
# Get the x-positions for the labels
x_pos = np.arange(len(labels)) + 1
# Get the y-axis limits and range (before they are changed below)
y_lim = ax.get_ylim()
y_range = y_lim[1] - y_lim[0]
# Extend the bottom of the y-axis by 5% of its range to fit the labels
ax.set_ylim([y_lim[0] - 0.05 * y_range, y_lim[1]])
# Set the y-position for the labels: half-way into the space just added
y_pos = y_lim[0] - 0.025 * y_range
# Create the annotations
for pos, median in zip(x_pos, medians):
    ax.text(
        pos, y_pos, rf'$\tilde{{x}} = {median}$ m',
        horizontalalignment='center', size='small'
    )
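The easiest way to plot multiple graphs that all have the same format is to create a function and call it repeatedly, once for each dataset. Some changes do need to be made, however:
- The figure and all of its axes are created at once with `fig, axes = plt.subplots(nrows, ncols)`
- The spacing around and between the subplots is set with `fig.subplots_adjust()`
- The axes are looped over with `for i, ax in enumerate(axes.flat):` and one dataset is plotted on each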
First, create the data:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# One data frame per event, with one column of results per Olympic Games

# Long jump
long_jump_men = pd.DataFrame({
    'Athens 2004': [8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21],
    'Beijing 2008': [8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00],
    'London 2012': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Rio 2016': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
})
long_jump_women = pd.DataFrame({
    'Athens 2004': [7.07, 7.05, 7.05, 6.96, 6.85, 6.83, 6.80, 6.73],
    'Beijing 2008': [7.04, 7.03, 6.91, 6.79, 6.76, 6.70, 6.64, 6.58],
    'London 2012': [7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67],
    'Rio 2016': [7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69],
})

# Shot put
shot_put_men = pd.DataFrame({
    'Athens 2004': [21.16, 21.16, 21.07, 20.84, 20.60, 20.34, 20.31, 20.26],
    'Beijing 2008': [21.51, 21.09, 21.05, 21.04, 20.98, 20.63, 20.53, 20.42],
    'London 2012': [21.89, 21.86, 21.23, 21.19, 20.93, 20.84, 20.71, 20.69],
    'Rio 2016': [22.52, 21.78, 21.36, 21.20, 21.02, 20.72, 20.64, 20.64],
})
shot_put_women = pd.DataFrame({
    'Athens 2004': [21.06, 19.59, 19.55, 19.49, 19.01, 18.96, 18.64, 18.59],
    'Beijing 2008': [20.56, 20.28, 19.86, 19.50, 19.20, 19.08, 19.01, 19.00],
    'London 2012': [21.36, 20.70, 20.48, 20.22, 19.63, 19.42, 19.18, 19.02],
    'Rio 2016': [20.63, 20.42, 19.87, 19.39, 19.35, 19.03, 18.37, 18.23],
})

# Javelin
javelin_men = pd.DataFrame({
    'Athens 2004': [86.50, 84.95, 84.84, 84.13, 83.31, 83.25, 83.14, 83.01],
    'Beijing 2008': [90.57, 86.64, 86.16, 83.95, 83.46, 83.45, 83.15, 82.06],
    'London 2012': [84.58, 84.51, 84.12, 83.34, 82.80, 82.63, 81.91, 81.21],
    'Rio 2016': [90.30, 88.24, 85.38, 85.32, 83.95, 83.05, 82.51, 82.42],
})
javelin_women = pd.DataFrame({
    'Athens 2004': [71.53, 65.82, 64.29, 64.23, 63.54, 62.77, 62.51, 61.75],
    'Beijing 2008': [71.42, 70.78, 66.13, 65.75, 65.29, 63.35, 62.02, 59.64],
    'London 2012': [69.55, 65.16, 64.91, 64.53, 63.70, 62.89, 61.62, 60.73],
    'Rio 2016': [66.18, 64.92, 64.80, 64.78, 64.60, 64.36, 64.04, 62.92],
})

# Collate: pair each title with its data frame, then split the pairs into
# three parallel lists (arrays to plot, column headings and plot titles)
events = [
    ("Men's Long Jump", long_jump_men),
    ("Women's Long Jump", long_jump_women),
    ("Men's Shot Put", shot_put_men),
    ("Women's Shot Put", shot_put_women),
    ("Men's Javelin", javelin_men),
    ("Women's Javelin", javelin_women),
]
data = [np.array(frame) for _, frame in events]
labels = [list(frame) for _, frame in events]
titles = [title for title, _ in events]
Now plot it:
# Make the figure A4: scale both sides down by sqrt(0.5) once per A-size step
A = 4
scale = .5**(.5 * A)
plt.rc('figure', figsize=[33.11 * scale, 46.82 * scale])
def plot_boxplots(data, labels, ax, title):
    """Draw one annotated boxplot per dataset on the given axes.

    The boxes are filled, outlier markers are hidden, the whiskers span the
    full range of the data and each sample's median is written underneath
    its box.

    Parameters
    ----------
    data
        The datasets, in any form accepted by ``ax.boxplot()`` (here a 2D
        array with one dataset per column).
    labels
        One tick label per dataset.
    ax
        The Matplotlib axes to draw on.
    title
        The title to give the axes.

    Returns
    -------
    dict
        The dictionary of artists returned by ``ax.boxplot()``, so that
        callers can customise the plot further.
    """
    # NOTE: Matplotlib 3.9 renamed the `labels` parameter to `tick_labels`;
    # `labels` is kept here for compatibility with older versions
    bp = ax.boxplot(
        data, sym='', whis=[0, 100], widths=0.4, labels=labels,
        patch_artist=True
    )

    # Axis details
    ax.set_title(title)
    ax.set_ylabel('Distance [m]')
    ax.set_xlabel('Olympics')
    # Remove the tick marks under the boxes and instead place minor ticks
    # half-way between them
    ax.tick_params(axis='x', bottom=False)
    ax.set_xticks(np.arange(len(labels) + 1) + 0.5, minor=True)
    # Set the fill colour
    plt.setp(bp['boxes'], facecolor='C0')
    # Set the colour of the median lines
    plt.setp(bp['medians'], color='black')

    # Get the samples' medians from the median lines; always show 2 decimal
    # places so that every label has the same precision (eg '8.10', not '8.1')
    medians = [f'{line.get_ydata()[0]:.2f}' for line in bp['medians']]
    # Get the x-positions for the labels
    x_pos = np.arange(len(labels)) + 1
    # Get the y-axis limits and range (before they are changed below)
    y_lim = ax.get_ylim()
    y_range = y_lim[1] - y_lim[0]
    # Extend the bottom of the y-axis by 5% of its range to fit the labels
    ax.set_ylim([y_lim[0] - 0.05 * y_range, y_lim[1]])
    # Set the y-position for the labels: half-way into the space just added
    y_pos = y_lim[0] - 0.025 * y_range
    # Create the annotations
    for pos, median in zip(x_pos, medians):
        ax.text(
            pos, y_pos, rf'$\tilde{{x}} = {median}$ m',
            horizontalalignment='center', size='small'
        )

    return bp
# Call function: a 3-by-2 grid of subplots, one per dataset
fig, axes = plt.subplots(3, 2)
# Set the figure margins and the spacing between the subplots
fig.subplots_adjust(
    left=0.1, right=0.98, top=0.97, bottom=0.06, wspace=0.3, hspace=0.25
)
# axes.flat iterates over the grid of axes as a single sequence, so the i-th
# axes gets the i-th dataset, labels and title
for i, ax in enumerate(axes.flat):
    plot_boxplots(data[i], labels[i], ax, titles[i])

plt.show()