🗓️ Session 10: Data preparation

Author

Published

03 05 2024

Modified

11 05 2024

In this session you learn how to turn your raw data into a state such that you can work with it. Luckily, there is one particular form for our data that represents the common starting point for all further operations, such as visualization or modelling. This form is called tidy data. And the goal of this session is to equip you with the tools that you need to turn the often messy raw data into tidy data. These skills are important because they make you independent: you will be able to prepare any data you find or create yourself such that you can further process it, and you will not rely on others to provide you data in a particular form.

👨‍🏫 Lecture Slides

Either click on the slide area below or click here to download the slides.

Data and solutions to the intermediate exercises

Solutions for the exercises in the session on data preparation.

view raw # Data Preparation: solutions hosted with ❤ by GitHub

	here::i_am("R/DataPrep-Exercise-1.R") # Adjust to your folder structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(tibble)
	library(data.table)

	file_path <- here::here("data/raw/exercise_1.csv")
	ex1_data <- data.table::fread(file = file_path)
	ex1_data <- tibble::as_tibble(ex1_data)

	ex1_data_filtered <- ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	)

	ex1_data_tidy <- ex1_data_filtered %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")

	tidy_path <- "data/tidy/ex1_solution.csv"
	data.table::fwrite(x = ex1_data_tidy, file = tidy_path)

	# Alternative formulation combining filtering and pivoting using the pipe:

	ex1_data %>%
	dplyr::filter(
	country %in% c("Germany", "Greece"),
	year %in% seq(1995, 2015),
	year >= 1995, year <= 2015 # equivalent to row above
	) %>%
	tidyr::pivot_wider(
	names_from = "indicator",
	values_from = "values")

view raw DataPrep-Exercise-1.R hosted with ❤ by GitHub

	here::i_am("R/DataPrep-Exercise-2.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(ggplot2) # only for the bonus

	# Import the data----------------------
	file_path <- here::here("data/raw/exercise_2.csv")
	ex2_data <- data.table::fread(file = file_path)
	ex2_data <- tibble::as_tibble(ex2_data)

	# Wrangle the data---------------------
	cols_to_keep <- c("country", "year", "gdp", "share_indus", "co2")

	ex2_data_final <- ex2_data %>%
	dplyr::select(
	dplyr::all_of(cols_to_keep)
	# or: -dplyr::all_of(c("unemp"))
	) %>%
	dplyr::mutate(share_indus=share_indus/100) %>%
	dplyr::filter(year>=2010, year<=2018) %>%
	pivot_longer(
	cols = dplyr::all_of(c("gdp", "share_indus", "co2")),
	names_to = "indicator",
	values_to = "value") %>%
	dplyr::group_by(country, indicator) %>%
	dplyr::summarise(
	time_avg=mean(value, na.rm=TRUE),
	.groups = "drop" # Not strictly necessary, but good practice
	)

	tidy_path <- "data/tidy/ex2_solution.csv"
	data.table::fwrite(x = ex2_data_final, file = tidy_path)

	# Bonus: make a plot from the data-----
	ex2_plot <- ex2_data_final %>%
	dplyr::filter(indicator=="co2") %>%
	ggplot2::ggplot(
	data = .,
	mapping = aes(x=indicator,
	y = time_avg,
	color=country,
	fill=country)
	) +
	geom_bar(
	stat = "identity",
	position = position_dodge(),
	alpha=0.75) +
	theme_bw() +
	labs(
	title = "Average CO2 emissions (2010-2018)",
	y = "avg. emissions per capita",
	caption = "Data: World Bank.") +
	scale_y_continuous(expand = expansion()) +
	scale_fill_brewer(
	palette = "Set1", aesthetics = c("color", "fill")) +
	theme(
	legend.title = element_blank(),
	legend.position = "bottom",
	axis.title.x = element_blank(),
	axis.ticks.x = element_blank(),
	axis.text.x = element_blank()
	)

	ggsave(plot = ex2_plot,
	filename = here("output/ex2_solution.pdf"),
	width = 4, height = 3)

view raw DataPrep-Exercise-2.R hosted with ❤ by GitHub

	here::i_am("R/DataPrep-Intermediate-Exercises.R")
	# Adjust to your directory structure
	library(here)
	library(dplyr)
	library(tidyr)
	library(data.table)
	library(DataScienceExercises)

	# Data is available via the course homepage.

	# 1. Short recap on reshaping------------
	data_raw_long <- fread(
	file = here("data/raw/data_raw_long.csv"))
	data_raw_long <- tibble::as_tibble(data_raw_long)
	head(data_raw_long)

	data_raw_long_w <- tidyr::pivot_wider(
	data = data_raw_long,
	names_from = "country",
	values_from = "value")
	data_raw_long_w

	data_raw_wide <- fread(
	file = here("data/raw/data_raw_wide.csv"), header = TRUE)
	data_raw_wide <- tibble::as_tibble(data_raw_wide)
	head(data_raw_wide)

	data_raw_wide_l <- data_raw_wide %>%
	tidyr::pivot_longer(
	cols = -country,
	names_to = "year",
	values_to = "gini")

	# 2. Short recap on manipulation basics------------
	wine_data <- tibble::as_tibble(DataScienceExercises::wine2dine)

	# Filter the data set such that it only contains white wines
	wine_1 <- wine_data %>%
	dplyr::filter(kind=="white")

	# Then remove the column 'kind'
	wine_2 <- wine_1 %>%
	dplyr::select(-kind)

	# Change the type of the column 'quality' into double
	wine_3 <- wine_2 %>%
	dplyr::mutate(quality=as.double(quality))

	# Divide the values in the columns 'alcohol' and 'residual sugar' by 100
	wine_4 <- wine_3 %>%
	dplyr::mutate(
	alcohol = alcohol/100,
	`residual sugar` = `residual sugar`/100
	)

	# Filter the data such that you only keep the wines with the highest quality score
	highest_quality <- max(wine_4$quality)
	wine_5 <- wine_4 %>%
	dplyr::filter(quality >= highest_quality)


	# 3. Short recap on summarizing and grouping------------
	# Summarise the data by computing the mean alcohol, mean sugar,
	# and mean quality of white and red wines

	## Alternative 1:
	wine_summary_1 <- wine_data %>%
	dplyr::group_by(kind) %>%
	dplyr::summarise(
	alc_mean = mean(alcohol),
	sugar_mean = mean(`residual sugar`),
	qual_mean = mean(quality))
	wine_summary_1

	## Alternative 2:
	wine_summary_1 <- wine_data %>%
	dplyr::summarise(
	alc_mean = mean(alcohol),
	sugar_mean = mean(`residual sugar`),
	qual_mean = mean(quality), .by = "kind")
	wine_summary_1

	## Alternative 3:
	wine_summary_1 <- wine_data %>%
	dplyr::summarise(across(.cols = everything(), .fns = mean), .by = "kind")
	wine_summary_1

	# Compute a variable indicating how the quality of each wine
	# deviates from the average quality of all wines.

	wine_summary_2 <- wine_data %>%
	dplyr::mutate(
	average_quality = mean(quality),
	quality_deviation = quality - average_quality
	)

	# 4. Short recap on joining data sets------------
	join_x <- tibble::as_tibble(data.table::fread(here("data/raw/join_x.csv")))
	join_y <- tibble::as_tibble(data.table::fread(here("data/raw/join_y.csv")))

	# Try for yourself what the function inner_join() does.
	# How does it differ from left_join(), right_join(), and full_join()?

	xy_inner <- dplyr::inner_join(x = join_x, y = join_y, by = c("time", "id"))
	# Only keeps rows where there are observations in both data sets -> avoids NA
	# In the present case, it produces an empty tibble; for more info check the tutorial

	# Consider the data sets join_x.csv and join_y.csv and the function
	# dplyr::full_join(). What is the difference of joining on columns time and
	# id vs joining only on column id?
	xy_full_t_id <- dplyr::full_join(x = join_x, y = join_y, by = c("time", "id"))
	xy_full_id <- dplyr::full_join(x = join_x, y = join_y, by = c("id"))
	xy_full_t_id
	xy_full_id


	# 5. Short recap on piping------------

	pipedata_v1 <- data.table::fread(here("data/raw/recap-pipes.csv"))

	pipedata_v2 <- tidyr::pivot_longer(
	data = pipedata_v1,
	cols = c("lifeExp", "gdpPercap"),
	names_to = "Indicator",
	values_to = "Value")

	pipedata_v3 <- tidyr::pivot_wider(
	data = pipedata_v2,
	names_from = "year",
	values_from = "Value")

	# Piped version:

	pipe_data_final <- data.table::fread(here("data/raw/recap-pipes.csv")) %>%
	tidyr::pivot_longer(
	cols = c("lifeExp", "gdpPercap"),
	names_to = "Indicator",
	values_to = "Value") %>%
	tidyr::pivot_wider(
	names_from = "year",
	values_from = "Value")

view raw DataPrep-IntermediateExs.R hosted with ❤ by GitHub

🎥 Lecture videos

All the videos are available via this playlist.

Expand to access the videos directly

Solutions to the intermediate exercises

📚 Mandatory Reading

The tutorial Data preparation
Chapter 5 in Wickham et al. (2023).

✍️ Coursework

Do the exercises Wrangling1 from the DataScienceExercises package

Quick code for starting the exercises

learnr::run_tutorial(
  name = "Wrangling1", 
  package = "DataScienceExercises", 
  shiny_args=list("launch.browser"=TRUE))

Download data about the CO2 emissions for some countries of your choice from the World Bank website for the years 2000 to 2020. Set up an R project, save the data, import it, and make a line graph.
- ZIP file with a possible solution¹
If you want more exercises on the challenge of making data longer/wider, you can do the exercises Wrangling2 from the DataScienceExercises package

Quick code for starting the exercises

learnr::run_tutorial(
  name = "Wrangling2", 
  package = "DataScienceExercises", 
  shiny_args=list("launch.browser"=TRUE))

References

Wickham, H. (2014) “Tidy Data,” Journal of Statistical Software 59(10).

Wickham, H., Çetinkaya-Rundel, M. and Grolemund, G. (2023) R for data science: Import, tidy, transform, visualize, and model data, 2nd edition., Beijing et al.: O’Reilly, available at https://r4ds.hadley.nz/.

Footnotes

You can ignore the make_co2_data.R for now and only look at make_co2_plot.R.↩︎

👨‍🏫 Lecture Slides

🎥 Lecture videos

📚 Mandatory Reading

Further Reading

✍️ Coursework

References

Footnotes