I created this report while following R-Ladies Freiburg’s “Guided Tidy Tuesday” meetup on 2021-06-08. Today’s data can be found in the Tidy Tuesday repository (dataset of 2020-07-07) and comes from the Coffee Quality Database; it includes reviews of multiple features of coffee beans from around the world.
# Load the coffee ratings data (Tidy Tuesday, 2020-07-07) directly from GitHub.
coffee <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv'
)
We can see that each row in the data corresponds to a coffee bean, and the columns reflect details of that coffee bean, including information about where it is produced and features rated for that bean:
# a few options for summarizing the data (output hidden)
#head(coffee)
#str(coffee)
# Render the skimr summary as a table, keeping only the first five rows
# (character variables) and the character-specific summary columns.
knitr::kable(skimr::skim(coffee) %>%
               slice_head(n = 5) %>% # 5 character variables
               select(skim_type:complete_rate,
                      character.min:character.whitespace))
skim_type | skim_variable | n_missing | complete_rate | character.min | character.max | character.empty | character.n_unique | character.whitespace |
---|---|---|---|---|---|---|---|---|
character | species | 0 | 1.0000000 | 7 | 7 | 0 | 2 | 0 |
character | owner | 7 | 0.9947722 | 3 | 50 | 0 | 315 | 0 |
character | country_of_origin | 1 | 0.9992532 | 4 | 28 | 0 | 36 | 0 |
character | farm_name | 359 | 0.7318895 | 1 | 73 | 0 | 571 | 0 |
character | lot_number | 1063 | 0.2061240 | 1 | 71 | 0 | 227 | 0 |
# Render the skimr summary as a table, keeping only the last five rows
# (numeric variables) and the numeric-specific summary columns.
knitr::kable(skimr::skim(coffee) %>%
               slice_tail(n = 5) %>% # 5 numeric variables
               select(skim_type:complete_rate,
                      numeric.mean:numeric.hist))
skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
---|---|---|---|---|---|---|---|---|---|---|---|
numeric | quakers | 1 | 0.9992532 | 0.1733931 | 0.832121 | 0 | 0 | 0.00 | 0 | 11 | ▇▁▁▁▁ |
numeric | category_two_defects | 0 | 1.0000000 | 3.5563854 | 5.312541 | 0 | 0 | 2.00 | 4 | 55 | ▇▁▁▁▁ |
numeric | altitude_low_meters | 230 | 0.8282300 | 1750.7133151 | 8669.440545 | 1 | 1100 | 1310.64 | 1600 | 190164 | ▇▁▁▁▁ |
numeric | altitude_high_meters | 230 | 0.8282300 | 1799.3477749 | 8668.805771 | 1 | 1100 | 1350.00 | 1650 | 190164 | ▇▁▁▁▁ |
numeric | altitude_mean_meters | 230 | 0.8282300 | 1775.0305450 | 8668.626080 | 1 | 1100 | 1310.64 | 1600 | 190164 | ▇▁▁▁▁ |
We notice that `species` and `country_of_origin` are characters but we would like to work with them as factors, so we convert these variables:
# Convert the two grouping variables from character to factor so they can be
# used as discrete groups in the plots below.
coffee <- coffee %>%
  mutate(across(c(country_of_origin, species), as_factor))
# Scatter plot of total cup points against mean altitude, coloured by species,
# with a linear fit drawn on top of the points.
ggplot(data = coffee %>%
         # leave out apparent outliers on both axes
         # (filter() also drops rows where either condition is NA)
         filter(altitude_mean_meters < 4500,
                total_cup_points > 50),
       aes(x = altitude_mean_meters, y = total_cup_points,
           colour = species)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  # fixed colour per species so the legend is stable
  scale_colour_manual(values = c('Arabica' = '#87BBA2',
                                 'Robusta' = '#EDB458')) +
  labs(x = 'Altitude (meters)', y = 'Total cup points',
       colour = 'Species',
       title = 'Coffee bean ratings by altitude and species',
       caption = 'Data source: Tidy Tuesday') +
  theme_classic()
#return top 4 most frequent countries
#coffee %>%
#  count(country_of_origin, sort = TRUE) %>%
#  slice_head(n = 4)
# add new column with frequency by country
# (country_n = number of rows sharing that row's country_of_origin)
coffee <- coffee %>%
  add_count(country_of_origin, name = 'country_n')
# Scatter plot of total cup points against mean altitude, coloured by country
# of origin and restricted to countries with more than 100 entries.
ggplot(data = coffee %>%
         # leave out apparent outliers on both axes
         filter(altitude_mean_meters < 4500,
                total_cup_points > 50) %>%
         # using 4 most frequent countries only
         #filter(country_of_origin %in% c('Mexico', 'Colombia', 'Guatemala', 'Brazil')),
         # using countries with more than 100 bean entries
         filter(country_n > 100),
       aes(x = altitude_mean_meters, y = total_cup_points,
           colour = country_of_origin)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  scale_colour_brewer(palette = 'Set2') +
  labs(x = 'Altitude (meters)', y = 'Total cup points',
       colour = 'Country of origin',
       title = 'Coffee bean ratings by altitude and country of origin',
       caption = 'Data source: Tidy Tuesday') +
  theme_classic()
# custom color palette
# (unnamed, so the manual scales assign the colours to the species levels in
# order)
pal_species <- c('seagreen', 'steelblue')
# Build one density plot per rated feature (aroma, sweetness, acidity), split
# by species, and arrange them side by side with a single shared legend.
fig_ratings <- ggarrange(
  # panel 1: aroma
  ggplot(data = coffee,
         aes(x = aroma, fill = species, colour = species)) +
    geom_density(alpha = 0.3) +
    scale_colour_manual(values = pal_species) +
    scale_fill_manual(values = pal_species) +
    labs(x = 'Aroma', y = 'Density',
         colour = 'Species', fill = 'Species',
         subtitle = 'Aroma') +
    theme_classic(),
  # panel 2: sweetness
  ggplot(data = coffee,
         aes(x = sweetness, fill = species, colour = species)) +
    geom_density(alpha = 0.3) +
    scale_colour_manual(values = pal_species) +
    scale_fill_manual(values = pal_species) +
    labs(x = 'Sweetness', y = 'Density',
         colour = 'Species', fill = 'Species',
         subtitle = 'Sweetness') +
    theme_classic(),
  # panel 3: acidity
  ggplot(data = coffee,
         aes(x = acidity, fill = species, colour = species)) +
    geom_density(alpha = 0.3) +
    scale_colour_manual(values = pal_species) +
    scale_fill_manual(values = pal_species) +
    labs(x = 'Acidity', y = 'Density',
         colour = 'Species', fill = 'Species',
         subtitle = 'Acidity') +
    theme_classic(),
  # one row of three panels; the legend is shared and placed on the right
  nrow = 1, ncol = 3, common.legend = TRUE, legend = 'right'
)
# Add an overall title above the combined panels and a small data-source
# label in the bottom-right corner of the figure.
annotate_figure(
  p = fig_ratings,
  top = text_grob('Coffee feature ratings by species'),
  fig.lab = 'Data source: Tidy Tuesday',
  fig.lab.pos = 'bottom.right',
  fig.lab.size = 8
)