# Launch the interactive learnr tutorial "Sampling" from the
# DataScienceExercises package; shiny_args makes it open in the browser.
# NOTE(review): the extracted snippet lacked the package prefix and used
# "learnrname"; learnr::run_tutorial()'s first argument is `name`.
learnr::run_tutorial(
  name = "Sampling",
  package = "DataScienceExercises",
  shiny_args = list("launch.browser" = TRUE)
)
🗓️ Sessions 15 and 16: Sampling
A central concept in data science - and in applied statistics more generally - is that of sampling. This refers to the strategy of using (small) samples to learn about a (large) population. For example, if you wanted to understand the effect of TV advertising on the consumer behaviour of young men in Germany, you could study the whole population of young men in Germany. But since this is usually not feasible, you would rather take a sample of young men, study their behaviour and then generalise to the whole population. In this session we will discuss when and how this is possible. In this context, we will also learn about the concept of Monte Carlo simulations and two central concepts of probability theory that underpin applied statistics and modern sampling theory: the central limit theorem and the law of large numbers.
👨‍🏫 Lecture Slides
Either click on the slide area below or click here to download the slides.
Code used during the lecture on sampling.
# Basics of for-loops in R --------------------------------------------
# NOTE(review): trailing "|" table-extraction artifacts removed; without
# that the script does not parse.

# A small character vector to iterate over:
base_object <- c("a", "b", "c", "d")
# Pre-allocated output container (kept from the lecture, unused below):
result_container <- rep(NA, length(base_object))

# Looping directly over the elements:
for (i in base_object) {
  print(i)
}

# Always better to use indices:
# (note: function "paste" used to add explanation)
for (i in seq_along(base_object)) {
  print(paste("Iteration:", i))
  print(paste("Element of base_object:", base_object[i]))
}

# Note: you do not need to use "i" in the action sequence, e.g. if you are just
# interested in repeating a certain action several times:
for (i in seq(1, 10)) {
  print(i)
  print(sample(base_object, size = 1)) # Draws a random element from base_object
}

# Exercise: write a for-loop that loops over the vector c(1,2,3,4,5,99)
# and computes the square root for each element.
base_object <- c(1, 2, 3, 4, 5, 99)

# Note: yes, you can do this via vectorization ...
sqrt(base_object)
# ... but we use this as a simple example to illustrate the idea.

# 1. Output container (NA_real_ keeps the vector numeric from the start):
output_container <- rep(NA_real_, length(base_object))
# 2. Looping sequence
for (i in seq_along(base_object)) { # 3. Action body
  output_container[i] <- sqrt(base_object[i])
}
output_container
# Load packages for data handling and plotting:
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# More elaborate example: the ball pid --------------------------------
# NOTE(review): trailing "|" table-extraction artifacts removed; without
# that the script does not parse.
# Note: for a slightly more elegant solution using a function in the loop,
# please read the tutorial on sampling.

# First step: create the artificial ball pid --------------------------
ball_pid_size <- 5000         # Total number of balls in the population
ball_pid_share_white <- 0.65  # True share of white balls
white_balls <- as.integer(ball_pid_share_white * ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
# Shuffle the colors (sample without `size` permutes) and attach ids:
ball_pid <- tibble::tibble(
  id = seq(1, ball_pid_size),
  color = sample(ball_pid_colors)
)
head(ball_pid)

## Conduct the simulation for only the case with sample size 20 -------
n_samples <- 1000  # The number of iterations we want to get
# Type-stable numeric output container:
results_n20 <- rep(NA_real_, n_samples)

# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)) {
  # Draw a sample of size 20 (without replacement, the sample() default):
  sample_drawn <- sample(x = ball_pid$color, size = 20)
  # Compute the share of white balls within this sample:
  share_white <- sum(sample_drawn == "white") / length(sample_drawn)
  # Write into output container:
  results_n20[i] <- share_white
}

# Compute mean and standard deviation of the sample distribution ------
mean(results_n20)
sd(results_n20)

## Visualize the result -----------------------------------------------
hist_visualization <- ggplot(
  data = tibble(results_n20),
  mapping = aes(x = results_n20)
) +
  geom_histogram(binwidth = 0.02, fill = "#00395B", alpha = 0.75) +
  # No padding below the bars, a little headroom above:
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(labels = percent_format()) +
  labs(
    x = "Share of white balls",
    y = "Number of samples",
    title = "True share: 65%") +
  # Mark the true population share for comparison:
  geom_vline(xintercept = 0.65) +
  theme_linedraw() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
hist_visualization
# Load packages (repeated so this chunk also runs standalone):
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# Create the population -----------------------------------------------
# NOTE(review): trailing "|" table-extraction artifacts removed; without
# that the script does not parse.
ball_pid_size <- 5000         # Total number of balls in the population
ball_pid_share_white <- 0.65  # True share of white balls
white_balls <- as.integer(ball_pid_share_white * ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
ball_pid <- tibble::tibble(
  id = seq(1, ball_pid_size),
  color = sample(ball_pid_colors)
)

# Conduct the simulation ----------------------------------------------
n_samples <- 1000  # The number of iterations we want to get
# Type-stable numeric output containers, one per sample size:
results_n20 <- rep(NA_real_, n_samples)   # n = 20
results_n50 <- rep(NA_real_, n_samples)   # n = 50
results_n100 <- rep(NA_real_, n_samples)  # n = 100

# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)) {
  # Draw samples of the respective sizes (without replacement):
  sample_drawn_20 <- sample(x = ball_pid$color, size = 20)
  sample_drawn_50 <- sample(x = ball_pid$color, size = 50)
  sample_drawn_100 <- sample(x = ball_pid$color, size = 100)
  # Compute the share of white balls within each sample:
  share_white_20 <- sum(sample_drawn_20 == "white") / length(sample_drawn_20)
  share_white_50 <- sum(sample_drawn_50 == "white") / length(sample_drawn_50)
  share_white_100 <- sum(sample_drawn_100 == "white") / length(sample_drawn_100)
  # Write into the output containers:
  results_n20[i] <- share_white_20
  results_n50[i] <- share_white_50
  results_n100[i] <- share_white_100
}

# Combine all three cases in one tibble:
result_table <- tibble(
  sample_size20 = results_n20,
  sample_size50 = results_n50,
  sample_size100 = results_n100
)

# Compute mean and standard deviation of the sample distribution ------
result_table %>%
  pivot_longer(
    cols = everything(),
    names_to = "Sample size",
    values_to = "Values") %>%
  summarise(
    `Mean share of whites` = mean(Values), `Variation` = sd(Values),
    .by = "Sample size")

# Visualize the result ------------------------------------------------
hist_visualization <- result_table %>%
  pivot_longer(
    cols = everything(),
    names_to = "Sample size",
    values_to = "Values") %>%
  # Strip the "sample_size" prefix so facets are labelled 20/50/100:
  mutate(`Sample size` = as.integer(gsub(
    x = `Sample size`, pattern = "sample_size", replacement = ""))
  ) %>%
  # The piped data becomes ggplot()'s first argument automatically,
  # so the explicit `data = .` is not needed:
  ggplot(mapping = aes(x = Values)) +
  geom_histogram(binwidth = 0.02, fill = "#00395B", alpha = 0.75) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(labels = percent_format()) +
  labs(
    x = "Share of white balls",
    y = "Number of samples",
    title = "True share: 65%") +
  # Mark the true population share for comparison:
  geom_vline(xintercept = 0.65) +
  # One panel per sample size, shared axes:
  facet_wrap(~`Sample size`, scales = "fixed") +
  theme_linedraw() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
hist_visualization
- Extensive solution for the exercise on the average height of EUF students
- Solution for the exercise on terminology
- Explanation of the Central Limit Theorem and the corresponding exercise
🎥 Lecture videos
So far, there are no learning videos available for this lecture.
📚 Mandatory Reading
- Tutorial on sampling
- Chapter 7 in Ismay and Kim (2020).
✍️ Coursework
- Do the exercises
Sampling
from the DataScienceExercises
package