An important step in data analysis is grouping measurements together into categories. This could be, for example, classifying results as having passed/failed a certain standard or stratifying patients into high-/medium-/low-risk groups with regard to some disease.
We’ll use the pre-loaded dataset ‘iris’. Collected by Edgar Anderson, it details the measurements of 150 iris flowers’ petals and sepals:
This famous (Fisher’s or Anderson’s) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.
For this page, however, we will only use the Iris setosa species:
# Keep only the Iris setosa rows of the built-in iris data frame
iris_setosa <- iris[iris$Species == "setosa", ]
# Show the first 15 flowers
print(head(iris_setosa, n = 15))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
# Scatter plot of petal length against sepal length for the setosa flowers
plot(
  Petal.Length ~ Sepal.Length,
  data = iris_setosa,
  main = "The Sepal and Petal Lengths of 50 Iris Flowers",
  xlab = "Sepal Length [cm]",
  ylab = "Petal Length [cm]"
)
We can classify the flowers as ‘small’ or ‘large’ fairly easily, using the ifelse()
function. The classification is stored in a new column:
# Label each flower by petal length: above 1.45 cm is "large", otherwise
# "small". The labels are stored in a new 'size' column.
iris_setosa$size <- with(
  iris_setosa,
  ifelse(Petal.Length > 1.45, "large", "small")
)
print(head(iris_setosa, n = 15))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species size
## 1 5.1 3.5 1.4 0.2 setosa small
## 2 4.9 3.0 1.4 0.2 setosa small
## 3 4.7 3.2 1.3 0.2 setosa small
## 4 4.6 3.1 1.5 0.2 setosa large
## 5 5.0 3.6 1.4 0.2 setosa small
## 6 5.4 3.9 1.7 0.4 setosa large
## 7 4.6 3.4 1.4 0.3 setosa small
## 8 5.0 3.4 1.5 0.2 setosa large
## 9 4.4 2.9 1.4 0.2 setosa small
## 10 4.9 3.1 1.5 0.1 setosa large
## 11 5.4 3.7 1.5 0.2 setosa large
## 12 4.8 3.4 1.6 0.2 setosa large
## 13 4.8 3.0 1.4 0.1 setosa small
## 14 4.3 3.0 1.1 0.1 setosa small
## 15 5.8 4.0 1.2 0.2 setosa small
# Scatter plot of every flower, drawn with the default plotting symbol
plot(
  iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
  main = "The Sepal and Petal Lengths of 50 Iris Flowers",
  xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overplot the 'small' flowers with crosses so the two groups can be told apart
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "small"), pch = 4
)
# Draw the classification threshold and label each side of it
abline(h = 1.45, col = "red", lty = "dashed")
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.9, "Large", col = "red")
Classifying the flowers into three groups (‘small’, ‘medium’ or ‘large’) can be done either using nested ifelse()
functions (as is done below) or using if statements (see under “Classification Using Two Variables”):
# Three-way split on petal length: the outer test picks out the "large"
# flowers, the inner test separates "medium" from "small".
iris_setosa$size <- with(
  iris_setosa,
  ifelse(
    Petal.Length > 1.65,
    "large",
    ifelse(Petal.Length > 1.35, "medium", "small")
  )
)
print(head(iris_setosa, n = 15))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species size
## 1 5.1 3.5 1.4 0.2 setosa medium
## 2 4.9 3.0 1.4 0.2 setosa medium
## 3 4.7 3.2 1.3 0.2 setosa small
## 4 4.6 3.1 1.5 0.2 setosa medium
## 5 5.0 3.6 1.4 0.2 setosa medium
## 6 5.4 3.9 1.7 0.4 setosa large
## 7 4.6 3.4 1.4 0.3 setosa medium
## 8 5.0 3.4 1.5 0.2 setosa medium
## 9 4.4 2.9 1.4 0.2 setosa medium
## 10 4.9 3.1 1.5 0.1 setosa medium
## 11 5.4 3.7 1.5 0.2 setosa medium
## 12 4.8 3.4 1.6 0.2 setosa medium
## 13 4.8 3.0 1.4 0.1 setosa medium
## 14 4.3 3.0 1.1 0.1 setosa small
## 15 5.8 4.0 1.2 0.2 setosa small
# Scatter plot of every flower, drawn with the default plotting symbol
plot(
  iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
  main = "The Sepal and Petal Lengths of 50 Iris Flowers",
  xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overplot the 'small' flowers with crosses...
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "small"),
  pch = 4
)
# ...and the 'medium' flowers with filled dots; 'large' keep the default symbol
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "medium"),
  pch = 20
)
# Draw the two classification thresholds
abline(h = c(1.35, 1.65), col = "red", lty = "dashed")
# Label the three bands
text(5.78, 1.3, "Small", col = "red")
text(5.78, 1.6, "Medium", col = "red")
text(5.78, 1.9, "Large", col = "red")
A more complicated classification of the flowers as ‘small’ or ‘large’ could be done by looking at both petal and sepal size:
# A flower is "large" only when BOTH its petal length exceeds 1.35 cm and its
# sepal length exceeds 4.95 cm; everything else is "small".
iris_setosa$size <- with(
  iris_setosa,
  ifelse(Petal.Length > 1.35 & Sepal.Length > 4.95, "large", "small")
)
print(head(iris_setosa, n = 15))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species size
## 1 5.1 3.5 1.4 0.2 setosa large
## 2 4.9 3.0 1.4 0.2 setosa small
## 3 4.7 3.2 1.3 0.2 setosa small
## 4 4.6 3.1 1.5 0.2 setosa small
## 5 5.0 3.6 1.4 0.2 setosa large
## 6 5.4 3.9 1.7 0.4 setosa large
## 7 4.6 3.4 1.4 0.3 setosa small
## 8 5.0 3.4 1.5 0.2 setosa large
## 9 4.4 2.9 1.4 0.2 setosa small
## 10 4.9 3.1 1.5 0.1 setosa small
## 11 5.4 3.7 1.5 0.2 setosa large
## 12 4.8 3.4 1.6 0.2 setosa small
## 13 4.8 3.0 1.4 0.1 setosa small
## 14 4.3 3.0 1.1 0.1 setosa small
## 15 5.8 4.0 1.2 0.2 setosa small
# Create the scatter plot (every flower, default plotting symbol)
plot(
  iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
  main = "The Sepal and Petal Lengths of 50 Iris Flowers",
  xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overplot the 'small' flowers with crosses
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "small"), pch = 4
)
# Plot the dashed lines: the left and bottom edges of the 'large' region,
# meeting at the corner (4.95, 1.35)
segments(4.95, 1.35, 4.95, 2, col = "red", lty = "dashed")
segments(4.95, 1.35, 6, 1.35, col = "red", lty = "dashed")
# Add the text
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.9, "Large", col = "red")
Again, classifying the flowers into three groups could be done using nested ifelse()
functions, but multiple ‘if statements’ in a loop are arguably clearer:
iris_setosa$size <- "" # Clear the size column
# Classify one flower at a time. The conditions are checked in order, so a
# flower is only tested for "medium" once it has failed the "large" test.
for (i in seq_len(nrow(iris_setosa))) {
  sepal_len <- iris_setosa[i, "Sepal.Length"]
  petal_len <- iris_setosa[i, "Petal.Length"]
  # `&&` is the scalar 'and' meant for if() conditions; `&` is the
  # element-wise operator for whole vectors (as used inside ifelse())
  if (sepal_len > 5.05 && petal_len > 1.55) {
    iris_setosa[i, "size"] <- "large"
  } else if (sepal_len > 4.65 && petal_len > 1.25) {
    iris_setosa[i, "size"] <- "medium"
  } else {
    iris_setosa[i, "size"] <- "small"
  }
}
print(head(iris_setosa, n = 15))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species size
## 1 5.1 3.5 1.4 0.2 setosa medium
## 2 4.9 3.0 1.4 0.2 setosa medium
## 3 4.7 3.2 1.3 0.2 setosa medium
## 4 4.6 3.1 1.5 0.2 setosa small
## 5 5.0 3.6 1.4 0.2 setosa medium
## 6 5.4 3.9 1.7 0.4 setosa large
## 7 4.6 3.4 1.4 0.3 setosa small
## 8 5.0 3.4 1.5 0.2 setosa medium
## 9 4.4 2.9 1.4 0.2 setosa small
## 10 4.9 3.1 1.5 0.1 setosa medium
## 11 5.4 3.7 1.5 0.2 setosa medium
## 12 4.8 3.4 1.6 0.2 setosa medium
## 13 4.8 3.0 1.4 0.1 setosa medium
## 14 4.3 3.0 1.1 0.1 setosa small
## 15 5.8 4.0 1.2 0.2 setosa small
# Create the scatter plot (every flower, default plotting symbol)
plot(
  iris_setosa$Sepal.Length, iris_setosa$Petal.Length,
  main = "The Sepal and Petal Lengths of 50 Iris Flowers",
  xlab = "Sepal Length [cm]", ylab = "Petal Length [cm]"
)
# Overplot the 'small' flowers with crosses...
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "small"),
  pch = 4
)
# ...and the 'medium' flowers with filled dots; 'large' keep the default symbol
points(
  Petal.Length ~ Sepal.Length,
  data = subset(iris_setosa, size == "medium"),
  pch = 20
)
# Plot the dashed lines
# Left and bottom edges of the 'large' region, corner at (5.05, 1.55)
segments(5.05, 1.55, 5.05, 2, col = "red", lty = "dashed")
segments(5.05, 1.55, 6, 1.55, col = "red", lty = "dashed")
# Left and bottom edges of the 'medium' region, corner at (4.65, 1.25)
segments(4.65, 1.25, 4.65, 2, col = "red", lty = "dashed")
segments(4.65, 1.25, 6, 1.25, col = "red", lty = "dashed")
# Add the text
text(5.78, 1.0, "Small", col = "red")
text(5.78, 1.45, "Medium", col = "red")
text(5.78, 1.9, "Large", col = "red")