⇦ Back

An important step in data analysis is grouping measurements together into categories. This could be, for example, classifying results as having passed/failed a certain standard or stratifying patients into high-/medium-/low-risk groups with regard to some disease.

1 Raw Data

We’ll use the pre-loaded dataset ‘iris’. Collected by Edgar Anderson, it details the measurements of 150 iris flowers’ petals and sepals:

This famous (Fisher’s or Anderson’s) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.

For this page, however, we will only use the Iris setosa species:

An Iris setosa flower
# Keep only the rows of the built-in iris data whose species is "setosa"
iris_setosa <- iris[iris$Species == "setosa", ]
# Show the first 15 rows of the filtered data
print(head(iris_setosa, n = 15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.0          1.4         0.1  setosa
## 14          4.3         3.0          1.1         0.1  setosa
## 15          5.8         4.0          1.2         0.2  setosa
# Scatter plot of petal length against sepal length for the setosa flowers
with(
    iris_setosa,
    plot(
        Sepal.Length, Petal.Length,
        xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]",
        main = "The Sepal and Petal Lengths of 50 Iris Flowers"
    )
)

2 Classification Using One Variable

2.1 Classification into Two Groups

We can classify the flowers as ‘small’ or ‘large’ fairly easily, using the ifelse() function. The classification is stored in a new column:

# Petals longer than 1.45 cm are labelled "large"; all others are "small"
iris_setosa[["size"]] <- ifelse(
    iris_setosa[["Petal.Length"]] > 1.45, "large", "small"
)
print(head(iris_setosa, n = 15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  size
## 1           5.1         3.5          1.4         0.2  setosa small
## 2           4.9         3.0          1.4         0.2  setosa small
## 3           4.7         3.2          1.3         0.2  setosa small
## 4           4.6         3.1          1.5         0.2  setosa large
## 5           5.0         3.6          1.4         0.2  setosa small
## 6           5.4         3.9          1.7         0.4  setosa large
## 7           4.6         3.4          1.4         0.3  setosa small
## 8           5.0         3.4          1.5         0.2  setosa large
## 9           4.4         2.9          1.4         0.2  setosa small
## 10          4.9         3.1          1.5         0.1  setosa large
## 11          5.4         3.7          1.5         0.2  setosa large
## 12          4.8         3.4          1.6         0.2  setosa large
## 13          4.8         3.0          1.4         0.1  setosa small
## 14          4.3         3.0          1.1         0.1  setosa small
## 15          5.8         4.0          1.2         0.2  setosa small
# Base scatter plot: every flower drawn with the default plotting symbol
plot(
    iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
    main = "The Sepal and Petal Lengths of 50 Iris Flowers",
    xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overlay crosses (pch = 4) on the flowers classified as "small"
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "small"), pch = 4
)
# Dashed line at the 1.45 cm petal-length threshold used for classification
abline(h = 1.45, col = "red", lty = "dashed")
# Label the regions below and above the threshold
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.9, "Large", col = "red")

2.2 Classification into Three+ Groups

This can either be done using nested ifelse() functions (as is done below) or using if statements (see under “Classification Using Two Variables”):

# Nested ifelse(): the outer test picks out "large" petals (> 1.65 cm); the
# inner test splits the remainder into "medium" (> 1.35 cm) and "small"
iris_setosa$size <- with(
    iris_setosa,
    ifelse(
        Petal.Length > 1.65,
        "large",
        ifelse(Petal.Length > 1.35, "medium", "small")
    )
)
print(head(iris_setosa, n = 15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   size
## 1           5.1         3.5          1.4         0.2  setosa medium
## 2           4.9         3.0          1.4         0.2  setosa medium
## 3           4.7         3.2          1.3         0.2  setosa  small
## 4           4.6         3.1          1.5         0.2  setosa medium
## 5           5.0         3.6          1.4         0.2  setosa medium
## 6           5.4         3.9          1.7         0.4  setosa  large
## 7           4.6         3.4          1.4         0.3  setosa medium
## 8           5.0         3.4          1.5         0.2  setosa medium
## 9           4.4         2.9          1.4         0.2  setosa medium
## 10          4.9         3.1          1.5         0.1  setosa medium
## 11          5.4         3.7          1.5         0.2  setosa medium
## 12          4.8         3.4          1.6         0.2  setosa medium
## 13          4.8         3.0          1.4         0.1  setosa medium
## 14          4.3         3.0          1.1         0.1  setosa  small
## 15          5.8         4.0          1.2         0.2  setosa  small
# Base scatter plot: every flower drawn with the default plotting symbol
plot(
    iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
    main = "The Sepal and Petal Lengths of 50 Iris Flowers",
    xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overlay crosses (pch = 4) on the "small" flowers
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "small"),
    pch = 4
)
# Overlay filled dots (pch = 20) on the "medium" flowers
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "medium"),
    pch = 20
)
# Dashed lines at the two petal-length thresholds used for classification
abline(h = 1.65, col = "red", lty = "dashed")
abline(h = 1.35, col = "red", lty = "dashed")
# Label the three bands
text(5.78, 1.3, "Small", col = "red")
text(5.78, 1.6, "Medium", col = "red")
text(5.78, 1.9, "Large", col = "red")

3 Classification Using Two Variables

3.1 Classification into Two Groups

A more complicated classification of the flowers as ‘small’ or ‘large’ could be done by looking at both petal length and sepal length:

# A flower is "large" only if BOTH its petal length exceeds 1.35 cm and its
# sepal length exceeds 4.95 cm; otherwise it is "small"
iris_setosa$size <- with(
    iris_setosa,
    ifelse(Petal.Length > 1.35 & Sepal.Length > 4.95, "large", "small")
)
print(head(iris_setosa, n = 15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  size
## 1           5.1         3.5          1.4         0.2  setosa large
## 2           4.9         3.0          1.4         0.2  setosa small
## 3           4.7         3.2          1.3         0.2  setosa small
## 4           4.6         3.1          1.5         0.2  setosa small
## 5           5.0         3.6          1.4         0.2  setosa large
## 6           5.4         3.9          1.7         0.4  setosa large
## 7           4.6         3.4          1.4         0.3  setosa small
## 8           5.0         3.4          1.5         0.2  setosa large
## 9           4.4         2.9          1.4         0.2  setosa small
## 10          4.9         3.1          1.5         0.1  setosa small
## 11          5.4         3.7          1.5         0.2  setosa large
## 12          4.8         3.4          1.6         0.2  setosa small
## 13          4.8         3.0          1.4         0.1  setosa small
## 14          4.3         3.0          1.1         0.1  setosa small
## 15          5.8         4.0          1.2         0.2  setosa small
# Create the scatter plot (every flower drawn with the default symbol)
plot(
    iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
    main = "The Sepal and Petal Lengths of 50 Iris Flowers",
    xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overlay crosses (pch = 4) on the flowers classified as "small"
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "small"), pch = 4
)
# Plot the dashed lines: both segments start at the corner of the "large"
# region (sepal length 4.95, petal length 1.35)
segments(4.95, 1.35, 4.95, 2, col = "red", lty = "dashed")
segments(4.95, 1.35, 6, 1.35, col = "red", lty = "dashed")
# Add the text
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.9, "Large", col = "red")

3.2 Classification into Three+ Groups

Again, this could be done using nested ifelse() functions, but using multiple ‘if statements’ in a loop is arguably clearer:

iris_setosa$size <- ""  # Clear the size column
# Classify each flower in turn. The branches are checked in order, so a row
# only reaches the "medium" test if it has already failed the "large" test.
for (i in seq_len(nrow(iris_setosa))) {
    # Each condition compares single values, so the scalar, short-circuiting
    # `&&` is used rather than the elementwise `&`
    if (
        (iris_setosa[i, "Sepal.Length"] > 5.05) &&
        (iris_setosa[i, "Petal.Length"] > 1.55)
    ) {
        iris_setosa[i, "size"] <- "large"
    } else if (
        (iris_setosa[i, "Sepal.Length"] > 4.65) &&
        (iris_setosa[i, "Petal.Length"] > 1.25)
    ) {
        iris_setosa[i, "size"] <- "medium"
    } else {
        iris_setosa[i, "size"] <- "small"
    }
}
print(head(iris_setosa, 15))
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   size
## 1           5.1         3.5          1.4         0.2  setosa medium
## 2           4.9         3.0          1.4         0.2  setosa medium
## 3           4.7         3.2          1.3         0.2  setosa medium
## 4           4.6         3.1          1.5         0.2  setosa  small
## 5           5.0         3.6          1.4         0.2  setosa medium
## 6           5.4         3.9          1.7         0.4  setosa  large
## 7           4.6         3.4          1.4         0.3  setosa  small
## 8           5.0         3.4          1.5         0.2  setosa medium
## 9           4.4         2.9          1.4         0.2  setosa  small
## 10          4.9         3.1          1.5         0.1  setosa medium
## 11          5.4         3.7          1.5         0.2  setosa medium
## 12          4.8         3.4          1.6         0.2  setosa medium
## 13          4.8         3.0          1.4         0.1  setosa medium
## 14          4.3         3.0          1.1         0.1  setosa  small
## 15          5.8         4.0          1.2         0.2  setosa  small
# Create the scatter plot (every flower drawn with the default symbol)
plot(
    iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
    main = "The Sepal and Petal Lengths of 50 Iris Flowers",
    xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overlay crosses (pch = 4) on the "small" flowers
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "small"),
    pch = 4
)
# Overlay filled dots (pch = 20) on the "medium" flowers
points(
    Petal.Length ~ Sepal.Length,
    data = subset(iris_setosa, size == "medium"),
    pch = 20
)
# Plot the dashed lines: one pair of segments per threshold corner, using the
# same cut-offs as the classification (5.05/1.55 for "large", 4.65/1.25 for
# "medium")
segments(5.05, 1.55, 5.05, 2, col = "red", lty = "dashed")
segments(5.05, 1.55, 6, 1.55, col = "red", lty = "dashed")
segments(4.65, 1.25, 4.65, 2, col = "red", lty = "dashed")
segments(4.65, 1.25, 6, 1.25, col = "red", lty = "dashed")
# Add the text
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.45, "Medium", col = "red")
text(5.78, 1.9, "Large", col = "red")

⇦ Back