Gejun's Blog

Demystifying the Central Limit Theorem

Marbles to the Rescue: Making the Abstract Tangible

Premise

Imagine that you have a bag filled with red and blue marbles.

How would you guess the proportion of red ones without going through every marble?

Simulating Marble Population

library(tidyverse)

source("../utils.R")   # helper functions

red_marble = "🔴"
blue_marble = "🔵"

# pretend you don't know this number
prob_red = 0.64
num_marbles = 1000

set.seed(42) 
marbles = sample(c(red_marble, blue_marble), 
                 size = num_marbles, 
                 replace = TRUE, 
                 prob = c(prob_red, 1 - prob_red))

Taking One Sample

set.seed(42)
one_sample = get_samples(marbles, sample_size = 10, number_of_samples = 1, func = is_red)

kable(one_sample, "html") |>
  kable_styling(font_size = 20)

sample_number	sample_value
1	0
1	1
1	1
1	0
1	1
1	1
1	0
1	1
1	0
1	0

sample_mean = mean(one_sample$sample_value)
print(str_glue("Percentage of red marbles: {sample_mean * 100}%"))

Percentage of red marbles: 50%

Taking More Samples

size = 10
n = 100

set.seed(42)
samples = get_samples(marbles, sample_size = size, number_of_samples = n, func = is_red)

sample_means = samples |>
  group_by(sample_number) |>
  summarise(proportion = mean(sample_value))

kable(head(sample_means, 5), "html") |>
  kable_styling(font_size = 20)

sample_number	proportion
1	0.5
2	0.7
3	0.4
4	0.7
5	0.8

standard_errors = tibble(sample_size = size, standard_error = sd(sample_means$proportion))
kable(standard_errors)

sample_size	standard_error
10	0.174298

Taking More Samples

p10 = plot_sample_means(samples, size, prob_red)
plot_samples_and_averages(samples, prob_red)

Increasing Sample Size to 20

size = 20
set.seed(42)
samples = get_samples(marbles, sample_size = size, number_of_samples = n, func = is_red)

sample_means = samples |>
  group_by(sample_number) |>
  summarise(proportion = mean(sample_value))

standard_errors = standard_errors |>
  add_row(sample_size = size, 
          standard_error = sd(sample_means$proportion))

kable(standard_errors)

sample_size	standard_error
10	0.1742980
20	0.1035665

Increasing Sample Size to 20

p20 = plot_sample_means(samples, size, prob_red)
grid.arrange(p10, p20, ncol=2)

Increasing Sample Size to 50

size = 50
set.seed(42)
samples = get_samples(marbles, sample_size = size, number_of_samples = n, func = is_red)

sample_means = samples |>
  group_by(sample_number) |>
  summarise(proportion = mean(sample_value))

standard_errors = standard_errors |>
  add_row(sample_size = size, 
          standard_error = sd(sample_means$proportion))

kable(standard_errors, "html")

sample_size	standard_error
10	0.1742980
20	0.1035665
50	0.0669916

Increasing Sample Size to 50

p50 = plot_sample_means(samples, size, prob_red)
grid.arrange(p10, p20, p50, ncol=2)

Increasing Sample Size to 100

size = 100
set.seed(42)
samples = get_samples(marbles, sample_size = size, number_of_samples = n, func = is_red)

sample_means = samples |>
  group_by(sample_number) |>
  summarise(proportion = mean(sample_value))

standard_errors = standard_errors |>
  add_row(sample_size = size, 
          standard_error = sd(sample_means$proportion))

kable(standard_errors, "html")

sample_size	standard_error
10	0.1742980
20	0.1035665
50	0.0669916
100	0.0494720

Increasing Sample Size to 100

p100 = plot_sample_means(samples, size, prob_red)
grid.arrange(p10, p20, p50, p100, ncol=2)

Central Limit Theorem

As the sample size increases, typically when it’s greater than 30, the distribution of the sample means approaches a normal distribution.

Skewed Data

set.seed(42)

n = 1000
mean_log = log(30000)
sd_log = 1 
salaries = rlnorm(n, meanlog = mean_log, sdlog = sd_log)

ggplot(data.frame(salaries), aes(x = salaries)) + 
  geom_histogram(bins = 50, fill = "blue", color = "black") + 
  geom_vline(xintercept = mean(salaries), color = "red", linewidth = 1) +
  geom_text(aes(x = mean(salaries) * 2, y = 400, label = str_glue("{round(mean(salaries), 2)}")), 
            color = "red", size = 3) +
  labs(title = "Salary Distribution", x = "Salary", y = "Count") + 
  theme_minimal()

Skewed Data

samples = get_samples(salaries, sample_size = 100, number_of_samples = 100)
plot_samples_and_averages(samples, mean(salaries))

Key Insights

Sample means cluster around the population mean
Larger sample sizes decrease variability
Distribution of sample means becomes more normal as size increases
Works for any population distribution

Red Marble Proportion Estimation

set.seed(42)
one_sample = sample(marbles, 100)
print(str_glue("Percentage of red marbles: {mean(one_sample == red_marble) * 100}%"))

Percentage of red marbles: 60%

Question

How confident are we in the estimation?
How can we quantify our level of certainty?