⇦ Back

If you’re going to use ggplot2, the first thing to do is load the library:

library(ggplot2)

1 Create a Simple Box Plot

1.1 Using a Pre-Installed Dataset

This example uses the chickwts dataset; a record of the weights of 71 chicks which had been fed one of 6 different diets for their first six weeks:

# Box plot of chick weight for each feed type
p <- ggplot(chickwts, aes(x = feed, y = weight)) +
    geom_boxplot()
print(p)

1.2 Using a Dataset from a Package

Here the training dataset from the Titanic package is used, with a small amount of editing:

library(titanic)

# Recode the survival flag: 0 becomes "Died" and 1 becomes "Survived"
titanic_train$Survived <- replace(
    titanic_train$Survived, titanic_train$Survived %in% 0, "Died"
)
titanic_train$Survived <- replace(
    titanic_train$Survived, titanic_train$Survived %in% 1, "Survived"
)

# Box plot of the fare paid, grouped by survival outcome
p <- ggplot(titanic_train, aes(x = Survived, y = Fare)) +
    geom_boxplot() +
    xlab("")  # Remove the x-axis label
print(p)

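Exclude the outliers by specifying outlier.shape = NA in the geom_boxplot() call, then restrict the range of the y-axis (eg with ylim()). Be aware that ylim() removes any observations that fall outside the limits before the box statistics are calculated; if you only want to zoom in without changing the boxes, use coord_cartesian(ylim = c(0, 125)) instead: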

# Plot
# Zoom in with coord_cartesian() rather than ylim(): ylim() would discard the
# fares above 125 before the box statistics are calculated, which changes the
# boxes instead of just hiding the top of the axis
p <- ggplot(titanic_train, aes(Survived, Fare))
p <- p + geom_boxplot(outlier.shape = NA)  # Do not draw the outlier points
p <- p + xlab("")  # Remove the x-axis label
p <- p + coord_cartesian(ylim = c(0, 125))
print(p)

1.3 Using a Data Frame in ‘Long’ Format

Take a look at the following dataset (which shows the results of the long jump finals at the last four Olympic Games):

# Raw data: one data frame per Olympic Games, each holding the eight
# distances of the men's and women's long jump finals. The single "olympics"
# label is recycled by data.frame() to fill all eight rows.
athens <- data.frame(
    olympics = "Athens 2004",
    long_jump_men = c(8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21),
    long_jump_women = c(7.07, 7.05, 7.05, 6.96, 6.85, 6.83, 6.80, 6.73)
)
beijing <- data.frame(
    olympics = "Beijing 2008",
    long_jump_men = c(8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00),
    long_jump_women = c(7.04, 7.03, 6.91, 6.79, 6.76, 6.70, 6.64, 6.58)
)
london <- data.frame(
    olympics = "London 2012",
    long_jump_men = c(8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93),
    long_jump_women = c(7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67)
)
rio <- data.frame(
    olympics = "Rio 2016",
    long_jump_men = c(8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05),
    long_jump_women = c(7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69)
)

# Stack the four Games on top of each other into one long-format data frame
df <- rbind(athens, beijing, london, rio)

This data is in ‘long’ format because the groups (ie which Olympics each data point comes from) are not separated out but are instead all listed in a column (‘olympics’):

# Show the first 15 rows of the combined data frame (console output follows)
print(head(df, 15))
##        olympics long_jump_men long_jump_women
## 1   Athens 2004          8.59            7.07
## 2   Athens 2004          8.47            7.05
## 3   Athens 2004          8.32            7.05
## 4   Athens 2004          8.31            6.96
## 5   Athens 2004          8.25            6.85
## 6   Athens 2004          8.24            6.83
## 7   Athens 2004          8.23            6.80
## 8   Athens 2004          8.21            6.73
## 9  Beijing 2008          8.34            7.04
## 10 Beijing 2008          8.24            7.03
## 11 Beijing 2008          8.20            6.91
## 12 Beijing 2008          8.19            6.79
## 13 Beijing 2008          8.19            6.76
## 14 Beijing 2008          8.16            6.70
## 15 Beijing 2008          8.07            6.64

This data can be plotted as follows:

# Box plot of the men's distances, one box per Olympic Games
p <- ggplot(df, aes(x = olympics, y = long_jump_men)) +
    geom_boxplot() +
    ggtitle("Men's Long Jump Finals") +  # Plot title
    ylab("Distance [m]") +  # y-axis label
    xlab("Olympics")  # x-axis label
print(p)

1.4 Using a Data Frame in ‘Wide’ Format

Compare the above dataset with this one (which contains the same information):

# Raw data in wide format: one column per Olympic Games.
# Note that data.frame() converts the column names into syntactically valid
# ones, so `Athens 2004` becomes Athens.2004 (and so on) in the result.
long_jump_men <- data.frame(
    `Athens 2004` = c(8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21),
    `Beijing 2008` = c(8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00),
    `London 2012` = c(8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93),
    `Rio 2016` = c(8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05)
)
long_jump_women <- data.frame(
    `Athens 2004` = c(7.07, 7.05, 7.05, 6.96, 6.85, 6.83, 6.80, 6.73),
    `Beijing 2008` = c(7.04, 7.03, 6.91, 6.79, 6.76, 6.70, 6.64, 6.58),
    `London 2012` = c(7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67),
    `Rio 2016` = c(7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69)
)

This data is in ‘wide’ format because the groups (ie which Olympics each data point comes from) each have their own column:

# Show the first 8 rows of the women's wide-format data (console output
# follows); the column names appear in their converted form, eg Athens.2004
print(head(long_jump_women, 8))
##   Athens.2004 Beijing.2008 London.2012 Rio.2016
## 1        7.07         7.04        7.12     7.17
## 2        7.05         7.03        7.07     7.15
## 3        7.05         6.91        6.89     7.08
## 4        6.96         6.79        6.88     6.95
## 5        6.85         6.76        6.77     6.81
## 6        6.83         6.70        6.76     6.79
## 7        6.80         6.64        6.72     6.74
## 8        6.73         6.58        6.67     6.69

Unfortunately, ggplot2 expects its data in long format, so you first need to convert the wide-format data with the gather() function from the tidyr package (in current versions of tidyr, gather() still works but has been superseded by pivot_longer()):

library(tidyr)

# Reshape from wide to long: the old column names go into "Olympics" and the
# values they held go into "Distance"
long_jump_women <- gather(long_jump_women, key = "Olympics", value = "Distance")

# Box plot of the women's distances, one box per Olympic Games
p <- ggplot(long_jump_women, aes(x = Olympics, y = Distance)) +
    geom_boxplot() +
    ggtitle("Women's Long Jump Finals") +  # Plot title
    ylab("Distance [m]") +  # y-axis label
    xlab("Olympics")  # x-axis label
print(p)

1.5 Using Vectors

Rather than a data frame (which is essentially a table with columns) the data to be plotted can start out as vectors (which are essentially lists of numbers):

# Raw data
# One numeric vector of eight long jump distances per Olympic Games (these
# are the same values as the men's columns used earlier)
athens2004 <- c(8.59, 8.47, 8.32, 8.31, 8.25, 8.24, 8.23, 8.21)
beijing2008 <- c(8.34, 8.24, 8.20, 8.19, 8.19, 8.16, 8.07, 8.00)
london2012 <- c(8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93)
rio2016 <- c(8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05)
# Show one of the vectors (console output follows)
print(athens2004)
## [1] 8.59 8.47 8.32 8.31 8.25 8.24 8.23 8.21

Much like before, ggplot2 cannot plot individual vectors. So you first need to combine them into a data frame in wide-format then convert that into long-format:

# Combine the vectors into a wide-format data frame (one column per Games)
long_jump_men <- data.frame(
    `Athens 2004` = athens2004,
    `Beijing 2008` = beijing2008,
    `London 2012` = london2012,
    `Rio 2016` = rio2016
)

# Reshape from wide to long: the old column names go into "Olympics" and the
# values they held go into "Distance"
long_jump_men <- gather(long_jump_men, key = "Olympics", value = "Distance")

# Box plot of the men's distances, one box per Olympic Games
p <- ggplot(long_jump_men, aes(x = Olympics, y = Distance)) +
    geom_boxplot() +
    ggtitle("Men's Long Jump Finals") +  # Plot title
    ylab("Distance [m]") +  # y-axis label
    xlab("Olympics")  # x-axis label
print(p)

Vectors provide more versatility than a data frame because not all vectors have to be the same length (although they just all happen to be the same length in this example) and vectors can be used in many functions that entire data frames cannot. On the other hand, when data is imported from or exported to a spreadsheet, it will usually be in the data frame format. Here’s a quick comparison of the data formats:

| Format  | Pro                                 | Con |
| ------- | ----------------------------------- | --- |
| Long    | All data in one data frame          | Verbose |
| Wide    | All data in one data frame, compact | Only works when there is the same number of data points in each group |
| Vectors | Versatile                           | Cannot be as easily imported from or exported to a spreadsheet |

2 Add Colour and a Legend

The quickest way to add colour and a legend is to use the fill keyword argument in the aesthetic (aes()) function:

# Mapping fill to the grouping variable colours each box and adds a legend
p <- ggplot(
    long_jump_women, aes(x = Olympics, y = Distance, fill = Olympics)
) +
    geom_boxplot() +
    ggtitle("Women's Long Jump Finals") +  # Plot title
    ylab("Distance [m]") +  # y-axis label
    xlab("")  # x-axis label
print(p)

Notice that we also removed the x-axis label now that we have it in the legend.

2.1 Change the Colours

Edit what colours are used with the scale_fill_manual() function:

# One fill colour per group, supplied by hand
box_colours <- c("cadetblue2", "tomato", "khaki1", "darkseagreen3")

p <- ggplot(long_jump_men, aes(x = Olympics, y = Distance, fill = Olympics)) +
    geom_boxplot() +
    scale_fill_manual(values = box_colours) +
    labs(
        title = "Men's Long Jump Finals",
        x = "",
        y = "Distance [m]",
        fill = "Olympic Games"  # Legend title
    )
print(p)

We also renamed the title of the legend (from “Olympics” to “Olympic Games”). This was done with the labs() function. As it happens, this function can actually be used to set all of the plot’s titles, so the ggtitle(), ylab() and xlab() functions were done away with and replaced by this one function.

2.2 Colour All Boxes the Same

Instead of each box having a different colour - ie having the fill colour correspond to the x-variable - we can give each box the same colour using the same fill keyword argument but in the geom_boxplot() function, not the aes() function. The alpha argument can be used to set the transparency from 0 (completely see-through) to 1 (solid colour - ie the default):

# The fill colour is a constant here rather than an aesthetic mapping, so
# there is no legend and labs() does not need a fill title
p <- ggplot(long_jump_men, aes(x = Olympics, y = Distance))
p <- p + geom_boxplot(fill = "orange", alpha = 0.5)  # Same colour, 50% opaque
p <- p + labs(title = "Men's Long Jump Finals", x = "", y = "Distance [m]")
print(p)

2.3 Colour the Outlines

The color option is actually used to control the colour of the outlines of the boxes, as opposed to the fill colour:

# fill sets the inside of the boxes and color sets their outlines. Both are
# constants rather than aesthetic mappings, so there is no legend and labs()
# does not need a fill title
p <- ggplot(long_jump_men, aes(x = Olympics, y = Distance))
p <- p + geom_boxplot(fill = "orange", alpha = 0.5, color = "red")
p <- p + labs(title = "Men's Long Jump Finals", x = "", y = "Distance [m]")
print(p)

3 Annotate Text - Sample Size & Sample Mean

It’s often useful to include descriptive statistics right in a figure, and this can be done by writing custom functions as follows:

# Functions
sample_size <- function(x) {
    # Sample size label for one group of values.
    # Returns a one-row data frame with the label text ("n  =  <count>") and
    # the height to draw it at, which is 0.3% above the median of x.
    label_text <- paste0("n  =  ", length(x))
    label_height <- median(x) * 1.003
    data.frame(y = label_height, label = label_text)
}
sample_mean <- function(x) {
    # Sample mean label for one group of values.
    # Returns a one-row data frame with the label text (the \U00B5 symbol
    # followed by the mean of x rounded to 2 decimal places) and the height
    # to draw it at, which is 0.3% below the median of x.
    rounded_mean <- round(mean(x), 2)
    label_height <- median(x) * 0.997
    data.frame(
        y = label_height,
        label = paste0("\U00B5  =  ", rounded_mean)
    )
}

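Note that the symbol for ‘mean’ is the Greek letter ‘mu’ (μ). Typing a symbol like this directly into R source code can cause encoding problems on some systems, so it is safer to use a Unicode escape instead. Take a look at the function definition for sample_mean() above: the label that it is returning uses \U00B5 - the \U introduces a Unicode escape and 00B5 is the code point of the micro sign “µ”, which looks the same as the Greek letter (the Greek letter itself is \U03BC).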

The code to create the plots then needs to incorporate the stat_summary() function:

# Box plot annotated with the sample size and sample mean of each group
p <- ggplot(long_jump_women, aes(x = Olympics, y = Distance, fill = Olympics))
p <- p + geom_boxplot()
p <- p + scale_fill_manual(
    values = c("cadetblue2", "tomato", "khaki1", "darkseagreen3")
)
p <- p + labs(
    title = "Women's Long Jump Finals", x = "", y = "Distance [m]",
    fill = "Olympic Games"
)
# Only fun.data is given to stat_summary(): the function passed there returns
# both the y position and the text of each label, and a separate fun argument
# is ignored whenever fun.data is supplied
p <- p + stat_summary(fun.data = sample_size, geom = "text")
p <- p + stat_summary(fun.data = sample_mean, geom = "text", colour = "black")
print(p)

4 Typesetting in Axis Labels

If we had a dataset where the units of one of the variables was “m³” and its symbol was “V”, we might want to start using more advanced typesetting options in our axis labels. This can be achieved with the bquote() function, inside which the caret (^) writes in superscript, the tilde symbol (~) inserts a space and the italic() function writes in italics:

p <- ggplot(trees, aes(x = Height, y = Volume))
p <- p + geom_point()  # Without a geom layer the panel would be drawn empty
# Inside the expression paste() joins its pieces with no separator, so the
# space after the comma has to be part of the string itself
p <- p + ylab(bquote(paste("Volume, ", italic("V") ~ (m^3))))
print(p)

5 Save Plot

Finally, save your plot to your computer as an image using one of the following (depending on what format you want the image to be in):

  • png("File Name.png")
  • pdf("File Name.pdf")
  • ggsave("File Name.png")

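If you use one of the first two, it must come before the graph is drawn (ie before you call print(p)) and you must call dev.off() afterwards to close the file - until you do, the image will not be written properly. If you use the last one (ggsave()) it must come after you’ve created the graph; by default it saves the most recent plot.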

⇦ Back