I created this report while following R-Ladies Freiburg’s “Guided Tidy Tuesday” meetup on 2021-06-08. Today’s data comes from the Coffee Quality Database and is available in the Tidy Tuesday repository (2020-07-07 release, loaded below); it includes reviews of multiple features of coffee beans from around the world.

# Read the Tidy Tuesday coffee ratings CSV directly from GitHub.
coffee <- readr::read_csv(
  file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv'
)

Data summary

We can see that each row in the data corresponds to a coffee bean, and the columns reflect details of that coffee bean, including information about where it is produced and features rated for that bean:

# Summarise the data with skimr and render the result as a table.
# First table: the top 5 rows of the skim output (5 character variables),
# keeping the general columns plus the character-specific summaries.
skimr::skim(coffee) %>%
  slice_head(n = 5) %>%
  select(skim_type:complete_rate, character.min:character.whitespace) %>%
  knitr::kable()
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace
character species 0 1.0000000 7 7 0 2 0
character owner 7 0.9947722 3 50 0 315 0
character country_of_origin 1 0.9992532 4 28 0 36 0
character farm_name 359 0.7318895 1 73 0 571 0
character lot_number 1063 0.2061240 1 71 0 227 0
# Second table: the bottom 5 rows of the skim output (5 numeric variables),
# keeping the general columns plus the numeric-specific summaries.
skimr::skim(coffee) %>%
  slice_tail(n = 5) %>%
  select(skim_type:complete_rate, numeric.mean:numeric.hist) %>%
  knitr::kable()
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric quakers 1 0.9992532 0.1733931 0.832121 0 0 0.00 0 11 ▇▁▁▁▁
numeric category_two_defects 0 1.0000000 3.5563854 5.312541 0 0 2.00 4 55 ▇▁▁▁▁
numeric altitude_low_meters 230 0.8282300 1750.7133151 8669.440545 1 1100 1310.64 1600 190164 ▇▁▁▁▁
numeric altitude_high_meters 230 0.8282300 1799.3477749 8668.805771 1 1100 1350.00 1650 190164 ▇▁▁▁▁
numeric altitude_mean_meters 230 0.8282300 1775.0305450 8668.626080 1 1100 1310.64 1600 190164 ▇▁▁▁▁

Fixing data types

We notice that species and country_of_origin are characters but we would like to work with them as factors, so we convert these variables:

# Convert the two grouping variables from character to factor in place.
coffee <- coffee %>%
  mutate(
    country_of_origin = as_factor(country_of_origin),
    species = as_factor(species)
  )


# Scatter plot of total cup points against mean altitude, coloured by species,
# with a linear-model smooth drawn for each colour group.
coffee %>%
  # leave out apparent outliers on both axes
  # (filter() also drops rows where either value is missing)
  filter(altitude_mean_meters < 4500,
         total_cup_points > 50) %>%
  ggplot(aes(x = altitude_mean_meters, y = total_cup_points,
             colour = species)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  scale_colour_manual(values = c('Arabica' = '#87BBA2',
                                 'Robusta' = '#EDB458')) +
  # last layer: no trailing `+`, otherwise the expression would run on into
  # the next statement in the file and fail to evaluate
  labs(x = 'Altitude (meters)', y = 'Total cup points',
       colour = 'Species',
       title = 'Coffee bean ratings by altitude and species',
       caption = 'Data source: Tidy Tuesday')

# Alternative (not run): return the 4 most frequent countries
# coffee %>%
#   count(country_of_origin, sort = TRUE) %>%
#   slice_head(n = 4)

# Attach each row's country frequency as a new column, `country_n`.
coffee <- add_count(coffee, country_of_origin, name = 'country_n')

# Scatter plot of total cup points against mean altitude, coloured by country
# of origin, with a linear-model smooth drawn for each colour group.
coffee %>%
  # leave out apparent outliers on both axes
  # (filter() also drops rows where either value is missing)
  filter(altitude_mean_meters < 4500,
         total_cup_points > 50) %>%
  # using 4 most frequent countries only
  # filter(country_of_origin %in% c('Mexico', 'Colombia', 'Guatemala', 'Brazil')) %>%
  # using countries with more than 100 bean entries
  filter(country_n > 100) %>%
  ggplot(aes(x = altitude_mean_meters, y = total_cup_points,
             colour = country_of_origin)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  scale_colour_brewer(palette = 'Set2') +
  # last layer: no trailing `+`, otherwise the expression would run on into
  # the next statement in the file and fail to evaluate
  labs(x = 'Altitude (meters)', y = 'Total cup points',
       colour = 'Country of origin',
       title = 'Coffee bean ratings by altitude and country of origin',
       caption = 'Data source: Tidy Tuesday')

# custom color palette; entries are named so each colour is tied to a species
# value rather than to the order of the factor levels
pal_species <- c('Arabica' = 'seagreen', 'Robusta' = 'steelblue')

# Density plot of a single rating column, split by species.
#   data:    data frame containing `species` and the rating column
#   feature: name of the rating column, as a string
#   label:   text used for both the x-axis title and the panel subtitle
#   palette: colours passed to the colour and fill scales
plot_rating_density <- function(data, feature, label, palette = pal_species) {
  ggplot(data = data,
         aes(x = .data[[feature]], fill = species, colour = species)) +
    geom_density(alpha = 0.3) +
    scale_colour_manual(values = palette) +
    scale_fill_manual(values = palette) +
    labs(x = label, y = 'Density',
         colour = 'Species', fill = 'Species',
         subtitle = label)
}

# Each panel is a separate argument to ggarrange(), so the plots are separated
# by commas; chaining them with `+` would try to add one plot to another.
fig_ratings <- ggarrange(
  plot_rating_density(coffee, 'aroma', 'Aroma'),
  plot_rating_density(coffee, 'sweetness', 'Sweetness'),
  plot_rating_density(coffee, 'acidity', 'Acidity'),
  nrow = 1, ncol = 3, common.legend = TRUE, legend = 'right'
)

# Add the overall title and the data-source label to the arranged figure.
annotate_figure(fig_ratings,
                top = text_grob('Coffee feature ratings by species'),
                fig.lab = 'Data source: Tidy Tuesday',
                fig.lab.pos = 'bottom.right', fig.lab.size = 8)