Coffee Bean Ratings

I created this report while following R Ladies Freiburg’s “Guided Tidy Tuesday” meetup on 2021-06-08. The data are available from the Tidy Tuesday repository (week of 2020-07-07) and come from the Coffee Quality Database; they include reviews of multiple features of coffee beans from around the world.

# Load the coffee ratings table directly from the Tidy Tuesday GitHub repository.
coffee <- readr::read_csv(
  file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv'
)

Data summary

We can see that each row in the data corresponds to a coffee bean, and the columns reflect details of that coffee bean, including information about where it is produced and features rated for that bean:

# Summarise the data with skimr and print only an excerpt of the result.
# (head(coffee) and str(coffee) are alternative quick looks; left disabled.)
# The first five rows of the skim output are character variables, so we keep
# the identifying columns plus the character-specific statistics.
skimr::skim(coffee) %>%
  slice_head(n = 5) %>%
  select(skim_type:complete_rate, character.min:character.whitespace) %>%
  knitr::kable()

skim_type	skim_variable	n_missing	complete_rate	character.min	character.max	character.n_unique
character	species	0	1.0000000	7	7	2
character	owner	7	0.9947722	3	50	315
character	country_of_origin	1	0.9992532	4	28	36
character	farm_name	359	0.7318895	1	73	571
character	lot_number	1063	0.2061240	1	71	227

# The last five rows of the skim output are numeric variables, so we keep the
# identifying columns plus the numeric-specific statistics and histogram.
skimr::skim(coffee) %>%
  slice_tail(n = 5) %>%
  select(skim_type:complete_rate, numeric.mean:numeric.hist) %>%
  knitr::kable()

skim_type	skim_variable	n_missing	complete_rate	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
numeric	quakers	1	0.9992532	0.1733931	0.832121	0	0	0.00	0	11	▇▁▁▁▁
numeric	category_two_defects	0	1.0000000	3.5563854	5.312541	0	0	2.00	4	55	▇▁▁▁▁
numeric	altitude_low_meters	230	0.8282300	1750.7133151	8669.440545	1	1100	1310.64	1600	190164	▇▁▁▁▁
numeric	altitude_high_meters	230	0.8282300	1799.3477749	8668.805771	1	1100	1350.00	1650	190164	▇▁▁▁▁
numeric	altitude_mean_meters	230	0.8282300	1775.0305450	8668.626080	1	1100	1310.64	1600	190164	▇▁▁▁▁

Fixing data types

We notice that species and country_of_origin are stored as character columns, but we would like to work with them as factors, so we convert them:

# Recode the two categorical columns as factors, one column at a time.
coffee <- mutate(
  coffee,
  country_of_origin = as_factor(country_of_origin),
  species = as_factor(species)
)

Plots

# Scatter plot of total cup points against mean altitude, coloured by species,
# with a linear fit drawn per species.
coffee %>%
  # Drop apparent outliers on both axes before plotting; filter() keeps only
  # rows where both conditions are TRUE, so rows with missing values go too.
  filter(
    altitude_mean_meters < 4500,
    total_cup_points > 50
  ) %>%
  ggplot(
    aes(
      x = altitude_mean_meters,
      y = total_cup_points,
      colour = species
    )
  ) +
  geom_point() +
  geom_smooth(method = 'lm') +
  # Named values tie each colour to a specific species.
  scale_colour_manual(
    values = c('Arabica' = '#87BBA2', 'Robusta' = '#EDB458')
  ) +
  labs(
    title = 'Coffee bean ratings by altitude and species',
    x = 'Altitude (meters)',
    y = 'Total cup points',
    colour = 'Species',
    caption = 'Data source: Tidy Tuesday'
  ) +
  theme_classic()

# Attach each country's number of entries to every row as `country_n`.
# (To list the four most frequent countries instead, one could run:
#  coffee %>% count(country_of_origin, sort = TRUE) %>% slice_head(n = 4))
coffee <- add_count(coffee, country_of_origin, name = 'country_n')

# Scatter plot of total cup points against mean altitude, coloured by country
# of origin, with a linear fit drawn per country.
coffee %>%
  # Drop apparent outliers on both axes.
  filter(
    altitude_mean_meters < 4500,
    total_cup_points > 50
  ) %>%
  # Keep only countries with more than 100 bean entries. (An alternative is
  # to hard-code the four most frequent countries:
  # filter(country_of_origin %in% c('Mexico', 'Colombia', 'Guatemala', 'Brazil')))
  filter(country_n > 100) %>%
  ggplot(
    aes(
      x = altitude_mean_meters,
      y = total_cup_points,
      colour = country_of_origin
    )
  ) +
  geom_point() +
  geom_smooth(method = 'lm') +
  scale_colour_brewer(palette = 'Set2') +
  labs(
    title = 'Coffee bean ratings by altitude and country of origin',
    x = 'Altitude (meters)',
    y = 'Total cup points',
    colour = 'Country of origin',
    caption = 'Data source: Tidy Tuesday'
  ) +
  theme_classic()

# Custom colour palette for species. The values are named so that each colour
# is tied to a species explicitly rather than to the order of factor levels.
pal_species <- c('Arabica' = 'seagreen', 'Robusta' = 'steelblue')

# Density plot of one rating column of `coffee`, split by species.
#   feature: name of the rating column to plot, given as a string
#   label:   text used for both the x-axis label and the panel subtitle
# Returns a ggplot object.
plot_rating_density <- function(feature, label) {
  ggplot(data = coffee,
         # .data[[feature]] looks the column up by its string name
         aes(x = .data[[feature]], fill = species, colour = species)) +
    geom_density(alpha = 0.3) +
    scale_colour_manual(values = pal_species) +
    scale_fill_manual(values = pal_species) +
    labs(x = label, y = 'Density',
         colour = 'Species', fill = 'Species',
         subtitle = label) +
    theme_classic()
}

# Three density panels side by side, sharing one legend on the right.
fig_ratings <- ggarrange(
  plot_rating_density('aroma', 'Aroma'),
  plot_rating_density('sweetness', 'Sweetness'),
  plot_rating_density('acidity', 'Acidity'),

  nrow = 1, ncol = 3, common.legend = TRUE, legend = 'right'
)

# Add an overall title above the combined figure and a small data-source
# label in its bottom-right corner.
fig_ratings %>%
  annotate_figure(
    top = text_grob('Coffee feature ratings by species'),
    fig.lab = 'Data source: Tidy Tuesday',
    fig.lab.pos = 'bottom.right',
    fig.lab.size = 8
  )

Coffee Bean Ratings

Tidy Tuesday 2020-07-07

Shelby Bachman

Data summary

Fixing data types

Plots