For this page we will use data from the finals of various Olympic events, available on Wikipedia.
This page functions as a follow-on from Boxplots with One Group of Data.
If we try to plot multiple datasets (eg the results from more than one Olympic Games) with two groups within each dataset (men and women), things start to become confusing:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Long jump final results at two Olympic Games, with one column per
# Games-and-gender combination
data = pd.DataFrame({
    'London 2012 (Men)': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Rio 2016 (Men)': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
    'London 2012 (Women)': [7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67],
    'Rio 2016 (Women)': [7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69]
})
# Plot every column as its own box on a single set of axes
ax = plt.axes()
# The column headings double up as the x-axis labels
column_headings = data.columns.tolist()
# The data frame is handed to boxplot() as a plain array
bp = ax.boxplot(data.to_numpy(), labels=column_headings)
# Axis details
ax.set_xlabel('Olympics')
ax.set_ylabel('Distance [m]')
ax.set_title('Long Jump Finals')
# The column headings are long, so shrink the x-axis label text
ax.tick_params(axis='x', which='major', labelsize=8)
plt.show()
Try breaking the plots up into groups, clearly showing which boxes belong together in order to aid interpretation. This will also allow you to fit more data in:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Results of the long jump finals at four Olympic Games: one data frame per
# Games, one column per group (men and women)
athens = pd.DataFrame({
    'Men': [8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21],
    'Women': [7.07, 7.05, 7.05, 6.96, 6.85, 6.83, 6.80, 6.73]
})
beijing = pd.DataFrame({
    'Men': [8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00],
    'Women': [7.04, 7.03, 6.91, 6.79, 6.76, 6.70, 6.64, 6.58]
})
london = pd.DataFrame({
    'Men': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Women': [7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67]
})
rio = pd.DataFrame({
    'Men': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
    'Women': [7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69]
})
datasets = [athens, beijing, london, rio]
# Create the plot
ax = plt.axes()
# Spread the datasets evenly over x = 0.75 to x = 1.25 so that each group of
# boxes ends up centred on a whole number
n_datasets = len(datasets)
x_pos_range = np.arange(n_datasets) / (n_datasets - 1)
x_pos = 0.75 + (x_pos_range * 0.5)
# The groups (column headings) are taken from the first dataset
group_names = list(datasets[0])
n_groups = len(group_names)
# Plot one dataset at a time; within a dataset, consecutive groups sit one
# x-unit apart
for i, data in enumerate(datasets):
    bp = ax.boxplot(
        data.to_numpy(),
        sym='',
        whis=[0, 100],
        widths=0.6 / n_datasets,
        labels=group_names,
        positions=[x_pos[i] + j for j in range(data.shape[1])]
    )
# Titles
ax.set_title('Long Jump Finals at the Last Four Olympic Games')
ax.set_ylabel('Distance [m]')
# Remove the major x-axis tickmarks
ax.tick_params(axis='x', bottom=False)
# One labelled (major) tick position per group, at x = 1, 2, ...
xticks = ax.set_xticks(np.arange(1, n_groups + 1))
# Minor tickmarks go halfway between the groups
xticks = ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
# Change the limits of the x-axis so that it starts and ends on a minor tick
xlim = ax.set_xlim([0.5, n_groups + 0.5])
plt.show()
Note that we could also have used lists-of-lists as the datasets instead of data frames (with some minor changes, eg data
would not have been transposed and the labels would have needed to be manually coded).
A big step towards better communication will be colour-coding the boxes. At the same time we’ll improve the format and size of the plots (see Image Sizes and Latex in Labels for more info):
# Settings
x = 6  # Want figures to be A6
# Scale the 46.82 x 33.11 base size down by a factor of sqrt(0.5) per A-step
paper_scale = .5**(.5 * x)
plt.rc('figure', figsize=[46.82 * paper_scale, 33.11 * paper_scale])
# Render the text with Latex, in a serif font
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
# Define which colours you want to use
colours = ['blue', 'red']
# Create the plot
ax = plt.axes()
# Set x-positions for boxes (evenly spread over x = 0.75 to x = 1.25)
n_datasets = len(datasets)
x_pos_range = np.arange(n_datasets) / (n_datasets - 1)
x_pos = 0.75 + (x_pos_range * 0.5)
# The groups (column headings) are taken from the first dataset
group_names = list(datasets[0])
n_groups = len(group_names)
# Plot
for i, data in enumerate(datasets):
    bp = ax.boxplot(
        data.to_numpy(),
        sym='',
        whis=[0, 100],
        widths=0.6 / n_datasets,
        labels=group_names,
        patch_artist=True,
        positions=[x_pos[i] + j for j in range(data.shape[1])]
    )
    # Fill the boxes with colours (requires patch_artist=True); the colours
    # are cycled through if there are more datasets than colours
    fill_colour = colours[i % len(colours)]
    for box in bp['boxes']:
        box.set_facecolor(fill_colour)
    # Make the median lines more visible
    for median_line in bp['medians']:
        median_line.set_color('black')
# Axis details
ax.set_title('Long Jump Finals at the Last Four Olympic Games')
ax.set_ylabel('Distance [m]')
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(1, n_groups + 1))
xticks = ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
xlim = ax.set_xlim([0.5, n_groups + 0.5])
plt.show()
The one glaring omission is that we still need to show which Olympics each set of data comes from…
Adding a legend will clarify what data belongs to what group.
from matplotlib.patches import Patch
# Define which colours you want to use
colours = ['blue', 'red']
# Define the groups (one legend label per dataset, in plotting order)
groups = ['Athens 2004', 'Beijing 2008', 'London 2012', 'Rio 2016']
# Legend: one coloured patch per dataset, cycling through the colours and
# labels in the same way as the boxes were coloured
legend_elements = [
    Patch(
        facecolor=colours[i % len(colours)],
        label=groups[i % len(groups)]
    )
    for i in range(len(datasets))
]
ax.legend(handles=legend_elements, fontsize=8)
The median value of each boxplot can be shown directly on the plot using annotations. Note that if there are too many groups when doing this the annotations might start to overlap, so we’ll drop down to just two for this example.
datasets = [london, rio]
# Define which colours you want to use
colours = ['blue', 'red']
# Define the groups: one legend label per dataset, in the same order as
# `datasets` (the labels must match the data that is actually plotted)
groups = ['London 2012', 'Rio 2016']
# Get the largest and smallest values across all of the datasets
y_max = max(d.to_numpy().max() for d in datasets)
y_min = min(d.to_numpy().min() for d in datasets)
# Calculate the y-axis range
y_range = y_max - y_min
# Create the plot
ax = plt.axes()
# Set x-positions for boxes
x_pos_range = np.arange(len(datasets)) / (len(datasets) - 1)
x_pos = (x_pos_range * 0.5) + 0.75
# The groups on the x-axis are the column headings of the first dataset
group_names = list(datasets[0])
n_groups = len(group_names)
# Set the y-position for the median labels (just below the lowest data point)
y_pos = y_min - 0.075 * y_range
# Plot
for i, data in enumerate(datasets):
    positions = [x_pos[i] + j for j in range(data.shape[1])]
    bp = ax.boxplot(
        np.array(data), sym='', whis=[0, 100], widths=0.6 / len(datasets),
        labels=group_names, patch_artist=True,
        positions=positions
    )
    # Fill the boxes with colours (requires patch_artist=True)
    fill_colour = colours[i % len(colours)]
    for box in bp['boxes']:
        box.set(facecolor=fill_colour)
    # Make the median lines more visible
    plt.setp(bp['medians'], color='black')
    # Annotate each box with its median, read off the median line that
    # boxplot() has just drawn
    for position, median_line in zip(positions, bp['medians']):
        median = str(round(median_line.get_ydata()[0], 2))
        ax.text(
            position, y_pos, r'$\tilde{x} =' + fr' {median}$m',
            horizontalalignment='center', size='medium'
        )
# Extend the y-axis by 10% of the data range at the bottom (to fit the median
# labels) and by 5% at the top
ax.set_ylim([y_min - 0.1 * y_range, y_max + 0.05 * y_range])
# Axis details
details = ax.set(
    title='Long Jump Finals at the Last Two Olympic Games',
    ylabel='Distance [m]'
)
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(n_groups) + 1)
xticks = ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
xlim = ax.set_xlim([0.5, n_groups + 0.5])
# Legend: one coloured patch per dataset
legend_elements = [
    Patch(
        facecolor=colours[i % len(colours)],
        label=groups[i % len(groups)]
    )
    for i in range(len(datasets))
]
ax.legend(handles=legend_elements, fontsize=8)
plt.show()
When things start getting busier you’ll want to have the legend positioned outside the plot itself. This will require you to edit the amount of white space between the edge of the graph and the edge of the image, and this can be done in the fig.add_axes()
function:
fig.add_axes()
takes a list of four numbers: the amount of white space to the left of and below the plot, followed by the plot’s width and height. The default is [0.125, 0.11, 0.775, 0.77]
, meaning that the white space to the left of the plot is equal to 12.5% of the width of the image, the white space at the bottom of the image is 11% of the image’s height, etc. With regards to the legend, its location cannot be finely adjusted when you use ax.legend()
so it cannot be moved outside the plot as things stand. However, we can use the more basic function plt.gca().legend()
(where gca
stands for ‘get current axis’) to isolate just the legend of the current set of axes in order to edit its hidden attributes. Specifically, we can use the bbox_to_anchor=()
keyword argument to define its position, so by setting this to \((1, 0.5)\) the legend will be placed at \(x = 1\) (ie outside the plot, to the right of it) and at \(y = 0.5\) (ie halfway up the plot).
# One legend label per dataset, in the same order as `datasets`. The labels
# must match the data that is actually plotted (London and Rio), otherwise
# the legend attributes the boxes to the wrong Olympic Games.
groups = ['London 2012', 'Rio 2016']
# Create the plot
# plt.axes([left, bottom, width, height])
# Default = [0.125, 0.11, 0.775, 0.77]
# The axes are made narrower than the default to leave room on the right of
# the image for the legend
ax = plt.axes([0.1, 0.06, 0.7, 0.86])
# Set x-positions for boxes
x_pos_range = np.arange(len(datasets)) / (len(datasets) - 1)
x_pos = (x_pos_range * 0.5) + 0.75
# The groups on the x-axis are the column headings of the first dataset
group_names = list(datasets[0])
n_groups = len(group_names)
# Set the y-position for the median labels (just below the lowest data point)
y_pos = y_min - 0.075 * y_range
# Plot
for i, data in enumerate(datasets):
    positions = [x_pos[i] + j for j in range(data.shape[1])]
    bp = ax.boxplot(
        np.array(data), sym='', whis=[0, 100], widths=0.6 / len(datasets),
        labels=group_names, patch_artist=True,
        positions=positions
    )
    # Fill the boxes with colours (requires patch_artist=True)
    fill_colour = colours[i % len(colours)]
    for box in bp['boxes']:
        box.set(facecolor=fill_colour)
    # Make the median lines more visible
    plt.setp(bp['medians'], color='black')
    # Annotate each box with its median, read off the median line that
    # boxplot() has just drawn
    for position, median_line in zip(positions, bp['medians']):
        median = str(round(median_line.get_ydata()[0], 2))
        ax.text(
            position, y_pos, r'$\tilde{x} =' + fr' {median}$m',
            horizontalalignment='center', size='medium'
        )
# Extend the y-axis by 10% of the data range at the bottom (to fit the median
# labels) and by 5% at the top
ax.set_ylim([y_min - 0.1 * y_range, y_max + 0.05 * y_range])
# Axis details
details = ax.set(
    title='Long Jump Finals at the Last Two Olympic Games',
    ylabel='Distance [m]'
)
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(n_groups) + 1)
xticks = ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
xlim = ax.set_xlim([0.5, n_groups + 0.5])
# Legend: one coloured patch per dataset. Each patch carries its own label,
# so only the handles need to be passed to legend().
legend_elements = [
    Patch(
        facecolor=colours[i % len(colours)],
        label=groups[i % len(groups)]
    )
    for i in range(len(datasets))
]
# Anchor the centre-left of the legend at x = 1 (the right-hand edge of the
# axes) and y = 0.5 (halfway up), which places it outside the plot
plt.gca().legend(
    handles=legend_elements,
    fontsize=8, loc='center left', bbox_to_anchor=(1, 0.5)
)
plt.show()
A background can help make the plot look more professional and gridlines can aid in interpretation:
- The background colour is set with ax.set_facecolor(). A specific colour can be set (eg ‘red’, ‘blue’, etc) while a number between 0 and 1 in string format (eg ‘0.9’) will invoke a shade of grey (‘0’ being black, ‘1’ being white)
- Gridlines are turned on with plt.grid(True)
- Individual horizontal lines can be added with ax.axhline()
# Background: a shade of grey with white gridlines on top of it
ax.set_facecolor('0.8')
plt.grid(True, color='white')
# Pick out y = 7 and y = 8 with darker, semi-transparent horizontal lines
for level in (7, 8):
    ax.axhline(level, color='black', alpha=0.4)
The method of plotting we have been using is robust to the number of datasets and groups that are used, so if we wanted to add in more data and swap things around it shouldn’t be too much of a problem:
# Results of the men's finals in five events at two Olympic Games: one data
# frame per Games, one column per event
london = pd.DataFrame({
    'Long Jump': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Shot Put': [21.89, 21.86, 21.23, 21.19, 20.93, 20.84, 20.71, 20.69],
    'Discus': [68.27, 68.18, 68.03, 67.38, 67.19, 65.85, 65.56, 64.79],
    'Hammer Throw': [80.59, 79.36, 78.71, 78.25, 77.86, 77.17, 77.10, 76.07],
    'Javelin': [84.58, 84.51, 84.12, 83.34, 82.80, 82.63, 81.91, 81.21]
})
rio = pd.DataFrame({
    'Long Jump': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
    'Shot Put': [22.52, 21.78, 21.36, 21.20, 21.02, 20.72, 20.64, 20.64],
    'Discus': [68.37, 67.55, 67.05, 66.58, 65.10, 64.95, 64.50, 63.72],
    'Hammer Throw': [78.68, 77.79, 77.73, 76.05, 75.97, 75.46, 75.28, 74.61],
    'Javelin': [90.30, 88.24, 85.38, 85.32, 83.95, 83.05, 82.51, 82.42]
})
datasets = [london, rio]
# Define which colours you want to use
colours = ['blue', 'red']
# Define the groups: one legend label per dataset, in the same order as
# `datasets` (the labels must match the data that is actually plotted)
groups = ['London 2012', 'Rio 2016']
# Get the largest and smallest values across all of the datasets
y_max = max(d.to_numpy().max() for d in datasets)
y_min = min(d.to_numpy().min() for d in datasets)
# Calculate the y-axis range
y_range = y_max - y_min
# Create the plot (using the default position of the axes within the figure)
ax = plt.axes()
# Set x-positions for boxes
x_pos_range = np.arange(len(datasets)) / (len(datasets) - 1)
x_pos = (x_pos_range * 0.5) + 0.75
# The groups on the x-axis are the column headings of the first dataset
group_names = list(datasets[0])
n_groups = len(group_names)
# Set the y-position for the median labels (just below the lowest data point)
y_pos = y_min - 0.075 * y_range
# Plot
for i, data in enumerate(datasets):
    positions = [x_pos[i] + j for j in range(data.shape[1])]
    bp = ax.boxplot(
        np.array(data), sym='', whis=[0, 100], widths=0.6 / len(datasets),
        labels=group_names, patch_artist=True,
        positions=positions
    )
    # Fill the boxes with colours (requires patch_artist=True)
    fill_colour = colours[i % len(colours)]
    for box in bp['boxes']:
        box.set(facecolor=fill_colour)
    # Make the median lines more visible
    plt.setp(bp['medians'], color='black')
    # Annotate each box with its median, read off the median line that
    # boxplot() has just drawn
    for position, median_line in zip(positions, bp['medians']):
        median = str(round(median_line.get_ydata()[0], 1))
        ax.text(
            position, y_pos, r'$\tilde{x}=' + fr'{median}$',
            horizontalalignment='center', size='xx-small'
        )
# Extend the y-axis by 10% of the data range at the bottom (to fit the median
# labels) and by 5% at the top
ax.set_ylim([y_min - 0.1 * y_range, y_max + 0.05 * y_range])
# Axis details
details = ax.set(
    title="Men's Finals at the Last Two Olympic Games",
    ylabel='Distance [m]'
)
ax.set_xlabel('Event')
ax.tick_params(axis='x', bottom=False)
xticks = ax.set_xticks(np.arange(n_groups) + 1)
xticks = ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
xlim = ax.set_xlim([0.5, n_groups + 0.5])
# Shrink the event names so that they fit next to each other. tick_params()
# is used rather than reaching into each tick's `label` attribute, which
# current versions of Matplotlib no longer provide.
ax.tick_params(axis='x', which='major', labelsize=7)
# Legend: one coloured patch per dataset
legend_elements = [
    Patch(
        facecolor=colours[i % len(colours)],
        label=groups[i % len(groups)]
    )
    for i in range(len(datasets))
]
ax.legend(handles=legend_elements, fontsize=8)
# Background
ax.set_facecolor('0.8')
plt.grid(True, color='white')
plt.show()
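The easiest way to plot multiple graphs that all have the same format is to create a function and call that repeatedly on each dataset. Some changes do need to be made, however:
- The figure and all of its axes are created together using fig, axes = plt.subplots(nrows, ncols)
- The spacing around and between the plots is set using fig.subplots_adjust()
- The function is called once per plot by looping over the axes using for i, ax in enumerate(axes.flat):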
def plot_boxplots(datasets, colours, groups, ax, title):
    """
    Draw grouped, colour-coded boxplots with median annotations on one axes.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
        One data frame per colour-coded series. Each column of a frame is
        drawn as one box; the column headings of the first frame are used as
        the x-axis labels.
    colours : list
        Face colours for the boxes. Dataset ``i`` is drawn in
        ``colours[i % len(colours)]``.
    groups : list of str
        Legend labels. Dataset ``i`` is labelled
        ``groups[i % len(groups)]``.
    ax : matplotlib axes
        The axes to draw on.
    title : str
        Title of the plot.

    Raises
    ------
    ValueError
        If ``datasets`` is empty.
    """
    n_datasets = len(datasets)
    if n_datasets == 0:
        raise ValueError('datasets must contain at least one data frame')
    # Get the largest and smallest values across all of the datasets
    y_max = max(d.to_numpy().max() for d in datasets)
    y_min = min(d.to_numpy().min() for d in datasets)
    # Calculate the y-axis range
    y_range = y_max - y_min
    # Set x-positions for boxes: spread evenly over x = 0.75 to x = 1.25. A
    # lone dataset is centred on the tick instead, as spreading it out would
    # mean dividing by zero.
    if n_datasets == 1:
        x_pos = np.array([1.0])
    else:
        x_pos_range = np.arange(n_datasets) / (n_datasets - 1)
        x_pos = (x_pos_range * 0.5) + 0.75
    # The groups on the x-axis are the column headings of the first dataset
    group_names = list(datasets[0])
    n_groups = len(group_names)
    # Set the y-position for the median labels (just below the lowest point)
    y_pos = y_min - 0.075 * y_range
    # Create the plot
    for i, data in enumerate(datasets):
        positions = [x_pos[i] + j for j in range(data.shape[1])]
        bp = ax.boxplot(
            np.array(data), sym='', whis=[0, 100], widths=0.6 / n_datasets,
            labels=group_names, patch_artist=True,
            positions=positions
        )
        # Fill the boxes with colours (requires patch_artist=True)
        fill_colour = colours[i % len(colours)]
        for box in bp['boxes']:
            box.set(facecolor=fill_colour)
        # Make the median lines more visible and annotate each box with its
        # median, read off the median line that boxplot() has just drawn
        for position, median_line in zip(positions, bp['medians']):
            median_line.set(color='black')
            median = str(round(median_line.get_ydata()[0], 2))
            ax.text(
                position, y_pos, r'$\tilde{x}=' + fr'{median}$m',
                horizontalalignment='center', size='x-small'
            )
    # Extend the y-axis by 10% of the data range at the bottom (to fit the
    # median labels) and by 5% at the top
    ax.set_ylim([y_min - 0.1 * y_range, y_max + 0.05 * y_range])
    # Axis details
    ax.set(title=title, ylabel='Distance [m]')
    ax.tick_params(axis='x', bottom=False)
    ax.set_xticks(np.arange(n_groups) + 1)
    ax.set_xticks(np.arange(n_groups + 1) + 0.5, minor=True)
    ax.set_xlim([0.5, n_groups + 0.5])
    # Legend: one coloured patch per dataset
    legend_elements = [
        Patch(
            facecolor=colours[i % len(colours)],
            label=groups[i % len(groups)]
        )
        for i in range(n_datasets)
    ]
    ax.legend(handles=legend_elements, fontsize=8)
# Make figures A4 in size (the width is the shorter side, ie portrait)
A = 4
paper_scale = .5**(.5 * A)
plt.rc('figure', figsize=[33.11 * paper_scale, 46.82 * paper_scale])
# Use Latex
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
# Create a grid of plots with 3 rows and 2 columns
fig, axes = plt.subplots(3, 2)
# Set the margins of the figure and the spacing between the plots
fig.subplots_adjust(
    left=0.1, right=0.98, top=0.97, bottom=0.03, wspace=0.3, hspace=0.3
)
# Call function once for each set of axes
# NOTE(review): this indexes `datasets`, `groups` and `title` once per plot,
# so each presumably needs to hold one entry per set of axes (with each entry
# of `datasets` and `groups` itself being a list) - `title` and the nested
# lists are not defined in the code shown here, so confirm before running
for i, ax in enumerate(axes.flat):
    plot_boxplots(datasets[i], colours, groups[i], ax, title[i])
plt.show()