Use the function create_dir_save_file() (defined below) to automatically download and save the required data (data/2020-06-01_statistical_thinking_2) and image (Images/2020-06-01_statistical_thinking_2) files.

After completing Statistical Thinking in Python (Part 1), you have the probabilistic mindset and foundational hacker stats skills to dive into data sets and extract useful information from them. In this course, you will do just that, expanding and honing your hacker stats toolbox to perform the two key tasks in statistical inference: parameter estimation and hypothesis testing. You will work with real data sets as you learn, culminating with an analysis of measurements of the beaks of Darwin's famous finches. You will emerge from this course with new knowledge and lots of practice under your belt, ready to attack your own inference problems out in the world.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import collections as mc
import numpy as np
from scipy.stats import pearsonr
import seaborn as sns
from pathlib import Path
import requests
from datetime import datetime
import matplotlib as mpl
mpl.__version__
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)
pd.set_option('display.expand_frame_repr', True)
plt.rcParams['figure.figsize'] = (10.0, 10.0)
# plt.style.use('seaborn-dark-palette')
plt.rcParams['axes.grid'] = True
plt.rcParams["patch.force_edgecolor"] = True
def create_dir_save_file
def create_dir_save_file(dir_path: Path, url: str):
    """
    Check if the path exists and create it if it does not.
    Check if the file exists and download it if it does not.
    """
    if not dir_path.parents[0].exists():
        dir_path.parents[0].mkdir(parents=True)
        print(f'Directory Created: {dir_path.parents[0]}')
    else:
        print('Directory Exists')
    if not dir_path.exists():
        r = requests.get(url, allow_redirects=True)
        open(dir_path, 'wb').write(r.content)
        print(f'File Created: {dir_path.name}')
    else:
        print('File Exists')
data_dir = Path('data/2020-06-01_statistical_thinking_2')
images_dir = Path('Images/2020-06-01_statistical_thinking_2')
def ecdf
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y
    y = np.arange(1, n+1) / n
    return x, y
def pearson_r
def pearson_r(x, y):
    """Compute Pearson correlation coefficient between two arrays."""
    # Compute correlation matrix: corr_mat
    corr_mat = np.corrcoef(x, y)
    # Return entry [0,1]
    return corr_mat[0, 1]
anscombe = 'https://assets.datacamp.com/production/repositories/470/datasets/fe820c6cbe9bcf4060eeb9e31dd86aa04264153a/anscombe.csv'
bee_sperm = 'https://assets.datacamp.com/production/repositories/470/datasets/e427679d28d154934a6c087b2fa945bc7696db6d/bee_sperm.csv'
female_lit_fer = 'https://assets.datacamp.com/production/repositories/470/datasets/f1e7f8a98c18da5c60b625cb8af04c3217f4a5c3/female_literacy_fertility.csv'
finch_beaks_1975 = 'https://assets.datacamp.com/production/repositories/470/datasets/eb228490f7d823bfa6458b93db075ca5ccd3ec3d/finch_beaks_1975.csv'
finch_beaks_2012 = 'https://assets.datacamp.com/production/repositories/470/datasets/b28d5bf65e38460dca7b3c5c0e4d53bdfc1eb905/finch_beaks_2012.csv'
fortis_beak = 'https://assets.datacamp.com/production/repositories/470/datasets/532cb2fecd1bffb006c79a28f344af2290d643f3/fortis_beak_depth_heredity.csv'
frog_tongue = 'https://assets.datacamp.com/production/repositories/470/datasets/df6e0479c0f292ce9d2b951385f64df8e2a8e6ac/frog_tongue.csv'
basball = 'https://assets.datacamp.com/production/repositories/470/datasets/593c37a3588980e321b126e30873597620ca50b7/mlb_nohitters.csv'
scandens_beak = 'https://assets.datacamp.com/production/repositories/470/datasets/7ff772e1f4e99ed93685296063b6e604a334236d/scandens_beak_depth_heredity.csv'
sheffield_weather = 'https://assets.datacamp.com/production/repositories/470/datasets/129cba08c45749a82701fbe02180c5b69eb9adaf/sheffield_weather_station.csv'
nohitter_time = 'https://raw.githubusercontent.com/trenton3983/DataCamp/master/data/2020-06-01_statistical_thinking_2/nohitter_time.csv'
sol = 'https://assets.datacamp.com/production/repositories/469/datasets/df23780d215774ff90be0ea93e53f4fb5ebbade8/michelson_speed_of_light.csv'
votes = 'https://assets.datacamp.com/production/repositories/469/datasets/8fb59b9a99957c3b9b1c82b623aea54d8ccbcd9f/2008_all_states.csv'
swing = 'https://assets.datacamp.com/production/repositories/469/datasets/e079fddb581197780e1a7b7af2aeeff7242535f0/2008_swing_states.csv'
datasets = [anscombe, bee_sperm, female_lit_fer, finch_beaks_1975, finch_beaks_2012, fortis_beak, frog_tongue, basball, scandens_beak, sheffield_weather, nohitter_time, sol, votes, swing]
data_paths = list()
for data in datasets:
    file_name = data.split('/')[-1].replace('?raw=true', '')
    data_path = data_dir / file_name
    create_dir_save_file(data_path, data)
    data_paths.append(data_path)
df_bb = pd.read_csv(data_paths[7])
nohitter_times = pd.read_csv(data_paths[10])
sol = pd.read_csv(data_paths[11])
votes = pd.read_csv(data_paths[12])
swing = pd.read_csv(data_paths[13])
fem = pd.read_csv(data_paths[2])
anscombe = pd.read_csv(data_paths[0], header=[0, 1])
anscombe.columns = [f'{x[1]}_{x[0]}' for x in anscombe.columns] # reduce multi-level column name to single level
sheffield = pd.read_csv(data_paths[9], header=[8], sep='\\s+')
frogs = pd.read_csv(data_paths[6], header=[14])
frogs.date = pd.to_datetime(frogs.date, format='%Y_%m_%d')
baseball_df = pd.read_csv(data_paths[7])
baseball_df.date = pd.to_datetime(baseball_df.date, format='%Y%m%d')
baseball_df['days_since_last_nht'] = baseball_df.date.diff() # calculate days since last no-hitter
baseball_df['games_since_last_nht'] = baseball_df.game_number.diff().fillna(0) - 1 # calculate games since last no-hitter
bees = pd.read_csv(data_paths[1], header=[3])
finch_1975 = pd.read_csv(data_paths[3])
finch_1975['year'] = 1975
finch_1975.rename(columns={'Beak length, mm': 'beak_length', 'Beak depth, mm': 'beak_depth'}, inplace=True)
finch_2012 = pd.read_csv(data_paths[4])
finch_2012['year'] = 2012
finch_2012.rename(columns={'blength': 'beak_length', 'bdepth': 'beak_depth'}, inplace=True)
finch = pd.concat([finch_1975, finch_2012]).reset_index(drop=True)
When doing statistical inference, we speak the language of probability. A probability distribution that describes your data has parameters. So, a major goal of statistical inference is to estimate the values of these parameters, which allows us to concisely and unambiguously describe our data and draw conclusions from it. In this chapter, you will learn how to find the optimal parameters, those that best describe your data.
np.random.normal
f, axes = plt.subplots(2, 2, figsize=(20, 10), sharex=False)
# Plot 1
sns.histplot(sol['velocity of light in air (km/s)'], bins=9, kde=True, ax=axes[0, 0])
axes[0, 0].set_title('Histogram of the Michelson Measurements')
axes[0, 0].set_ylabel('PDF')
# Plot 2
mean = np.mean(sol['velocity of light in air (km/s)'])
std = np.std(sol['velocity of light in air (km/s)'])
samples = np.random.normal(mean, std, size=10000)
x, y = ecdf(sol['velocity of light in air (km/s)'])
x_theor, y_theor = ecdf(samples)
sns.set()
sns.lineplot(x=x_theor, y=y_theor, label='theoretical', ax=axes[0, 1])
sns.scatterplot(x=x, y=y, label='measured', color='purple', ax=axes[0, 1])
axes[0, 1].set_title('CDF of the Michelson Measurements')
axes[0, 1].set_xlabel('speed of light (km/s)')
axes[0, 1].set_ylabel('CDF')
# Plot 3
samples = np.random.normal(mean, std*1.5, size=10000)
x_theor3, y_theor3 = ecdf(samples)
sns.set()
sns.lineplot(x=x_theor3, y=y_theor3, label='theoretical', ax=axes[1, 0])
sns.scatterplot(x=x, y=y, label='measured', color='purple', ax=axes[1, 0])
axes[1, 0].set_title('CDF of the Michelson Measurements: STD 50% Larger')
axes[1, 0].set_xlabel('speed of light (km/s)')
axes[1, 0].set_ylabel('CDF')
ticks = axes[1, 0].get_xticks()
# Plot 4
samples = np.random.normal(mean*1.0001, std, size=10000)
x_theor4, y_theor4 = ecdf(samples)
sns.set()
sns.lineplot(x=x_theor4, y=y_theor4, label='theoretical', ax=axes[1, 1])
sns.scatterplot(x=x, y=y, label='measured', color='purple', ax=axes[1, 1])
axes[1, 1].set_title('CDF of the Michelson Measurements: Mean 0.01% Larger')
axes[1, 1].set_xlabel('speed of light (km/s)')
axes[1, 1].set_ylabel('CDF')
plt.tight_layout()
plt.show()
The number of games played between each no-hitter in the modern era (1901-2015) of Major League Baseball is stored in the array nohitter_times.
If you assume that no-hitters are described as a Poisson process, then the time between no-hitters is Exponentially distributed. As you have seen, the Exponential distribution has a single parameter, which we will call $\tau$, the typical interval time. The value of the parameter $\tau$ that makes the exponential distribution best match the data is the mean interval time (where time is in units of number of games) between no-hitters.
Compute the value of this parameter from the data. Then, use np.random.exponential()
to "repeat" the history of Major League Baseball by drawing inter-no-hitter times from an exponential distribution with the $\tau$ you found and plot the histogram as an approximation to the PDF.
NumPy, pandas, matlotlib.pyplot, and seaborn have been imported for you as np
, pd
, plt
, and sns
, respectively.
Instructions
- Seed the random number generator with 42.
- Compute the mean time between no-hitters and store it as tau.
- Draw 100,000 samples from an Exponential distribution with parameter tau.
- Plot the histogram as an approximation to the PDF using plt.hist(). Remember to use keyword arguments bins=50, normed=True (density=True in current Matplotlib), and histtype='step'. Be sure to label your axes.
# Seed random number generator
np.random.seed(42)
# Compute mean no-hitter time: tau
tau = np.mean(nohitter_times.nohitter_time)
# Draw out of an exponential distribution with parameter tau: inter_nohitter_time
inter_nohitter_time = np.random.exponential(tau, 100000)
# Plot the PDF and label axes
_ = plt.hist(inter_nohitter_time, bins=50, density=True, histtype='step')
_ = plt.xlabel('Games between no-hitters')
_ = plt.ylabel('PDF')
# Show the plot
plt.show()
We see the typical shape of the Exponential distribution, going from a maximum at 0 and decaying to the right.
You have modeled no-hitters using an Exponential distribution. Create an ECDF of the real data. Overlay the theoretical CDF with the ECDF from the data. This helps you to verify that the Exponential distribution describes the observed data.
It may be helpful to remind yourself of the function you created in the previous course to compute the ECDF, as well as the code you wrote to plot it.
Instructions
- Compute an ECDF from the actual time between no-hitters (nohitter_times). Use the ecdf() function you wrote in the prequel course.
- Create a CDF from the theoretical samples (inter_nohitter_time).
- Plot x_theor and y_theor as a line using plt.plot(). Then overlay the ECDF of the real data x and y as points. To do this, you have to specify the keyword arguments marker='.' and linestyle='none' in addition to x and y inside plt.plot().
# Create an ECDF from real data: x, y
x, y = ecdf(nohitter_times.nohitter_time)
# Create a CDF from theoretical samples: x_theor, y_theor
x_theor, y_theor = ecdf(inter_nohitter_time)
# Overlay the plots
plt.plot(x_theor, y_theor, label='Theoretical ECDF')
plt.plot(x, y, marker='.', linestyle='none', label='Real ECDF')
# Margins and axis labels
plt.margins(0.02)
plt.xlabel('Games between no-hitters')
plt.ylabel('CDF')
plt.legend()
# Show the plot
plt.show()
It looks like no-hitters in the modern era of Major League Baseball are Exponentially distributed. Based on the story of the Exponential distribution, this suggests that they are a random process; when a no-hitter will happen is independent of when the last no-hitter was.
Now sample out of an exponential distribution with $\tau$ being twice as large as the optimal $\tau$. Do it again for $\tau$ half as large. Make CDFs of these samples and overlay them with your data. You can see that they do not reproduce the data as well. Thus, the $\tau$ you computed from the mean inter-no-hitter times is optimal in that it best reproduces the data.
Note: In this and all subsequent exercises, the random number generator is pre-seeded for you to save you some typing.
Instructions
- Draw 10000 samples out of an Exponential distribution with parameter $\tau_{\frac{1}{2}} =$ tau/2.
- Draw 10000 samples out of an Exponential distribution with parameter $\tau_{2} =$ 2*tau.
- Generate CDFs from these samples using your ecdf() function and add them to your plot.
# Plot the theoretical CDFs
plt.plot(x_theor, y_theor, label='Theoretical ECDF')
plt.plot(x, y, marker='.', linestyle='none', label='Real ECDF')
plt.margins(0.02)
plt.xlabel('Games between no-hitters')
plt.ylabel('CDF')
# Compute mean no-hitter time: tau
tau = np.mean(nohitter_times.nohitter_time)
# Take samples with half tau: samples_half
samples_half = np.random.exponential(tau/2, 10000)
# Take samples with double tau: samples_double
samples_double = np.random.exponential(tau*2, 10000)
# Generate CDFs from these samples
x_half, y_half = ecdf(samples_half)
x_double, y_double = ecdf(samples_double)
# Plot these CDFs as lines
_ = plt.plot(x_half, y_half, label='Half tau')
_ = plt.plot(x_double, y_double, label='Double tau')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Show the plot
plt.show()
Notice how the value of tau given by the mean matches the data best. In this way, tau is an optimal parameter.
np.polyfit()
slope, intercept = np.polyfit(swing.total_votes, swing.dem_share, 1)
print(slope)
>>> 4.0370717009465555e-05
print(intercept)
>>> 40.113911968641744
g = sns.regplot(x='total_votes', y='dem_share', data=swing, color='purple')
plt.xlim(0, 900000)
plt.ylabel('Percent of Vote for Obama')
plt.xlabel('Total Votes')
plt.title('2008 US Swing State Election Results')
plt.annotate('intercept', xy=(0, 40.1), weight='bold', xytext=(300000, 25),
fontsize=15, arrowprops=dict(arrowstyle="->", color='black'))
plt.text(550000, 70, 'slope', fontsize=15, weight='bold')
lc = mc.LineCollection([[(500000, 69), (720000, 69)], [(500000, 69), (500000, 60)]], color='black')
g.add_collection(lc)
plt.text(520000, 57, 'residual', fontsize=15, weight='bold')
plt.annotate('', xy=(513312, 53.59), xytext=(513312, 61), arrowprops=dict(arrowstyle="<->", color='black'))
plt.show()
In the next few exercises, we will look at the correlation between female literacy and fertility (defined as the average number of children born per woman) throughout the world. For ease of analysis and interpretation, we will work with the illiteracy rate.
It is always a good idea to do some EDA ahead of our analysis. To this end, plot the fertility versus illiteracy and compute the Pearson correlation coefficient. The Numpy array illiteracy
has the illiteracy rate among females for most of the world's nations. The array fertility
has the corresponding fertility data.
Here, it may be useful to refer back to the function you wrote in the previous course to compute the Pearson correlation coefficient.
Instructions
- Plot fertility (y-axis) versus illiteracy (x-axis) as a scatter plot.
- Set a 2% margin and label the axes.
- Compute and print the Pearson correlation coefficient between illiteracy and fertility.
fem.head()
illiteracy = 100 - fem['female literacy']
fertility = fem['fertility']
# Plot the illiteracy rate versus fertility
plt.plot(illiteracy, fertility, marker='.', linestyle='none')
# Set the margins and label axes
plt.margins(0.02)
plt.xlabel('percent illiterate')
plt.ylabel('fertility')
# Show the plot
plt.show()
# Show the Pearson correlation coefficient
print(pearson_r(illiteracy, fertility))
print(pearsonr(illiteracy, fertility))
You can see the correlation between illiteracy and fertility by eye, and it is confirmed by the substantial Pearson correlation coefficient of about 0.8. It is difficult to resolve in the scatter plot, but there are many points clustered around near-zero illiteracy and about 1.8 children/woman.
We will assume that fertility is a linear function of the female illiteracy rate. That is, $f=ai+b$, where a is the slope and b is the intercept. We can think of the intercept as the minimal fertility rate, probably somewhere between one and two. The slope tells us how the fertility rate varies with illiteracy. We can find the best fit line using np.polyfit()
.
Plot the data and the best fit line. Print out the slope and intercept. (Think: what are their units?)
Instructions
- Compute the slope and intercept of the regression line using np.polyfit(). Remember, fertility is on the y-axis and illiteracy on the x-axis.
- Print out the slope and intercept from the linear regression.
- To plot the best-fit line, create an array x that consists of 0 and 100 using np.array(). Then, compute the theoretical values of y based on your regression parameters, i.e., y = a * x + b.
- Plot the data and the regression line on the same plot.
# Plot the illiteracy rate versus fertility
plt.plot(illiteracy, fertility, marker='.', linestyle='none')
plt.margins(0.02)
plt.xlabel('percent illiterate')
plt.ylabel('fertility')
# Perform a linear regression using np.polyfit(): a, b
a, b = np.polyfit(illiteracy, fertility, 1)
# Print the results to the screen
print('slope =', a, 'children per woman / percent illiterate')
print('intercept =', b, 'children per woman')
# Make theoretical line to plot
x = np.array([0, 100])
y = a * x + b
# Add regression line to your plot
plt.plot(x, y)
# Draw the plot
plt.show()
The function np.polyfit()
that you used to get your regression parameters finds the optimal slope and intercept. It is optimizing the sum of the squares of the residuals, also known as RSS (for residual sum of squares). In this exercise, you will plot the function that is being optimized, the RSS, versus the slope parameter a
. To do this, fix the intercept to be what you found in the optimization. Then, plot the RSS vs. the slope. Where is it minimal?
Instructions
- Specify the values of the slope for which to compute the RSS. Use np.linspace() to get 200 points in the range between 0 and 0.1. For example, to get 100 points in the range between 0 and 0.5, you could use np.linspace() like so: np.linspace(0, 0.5, 100).
- Initialize an array, rss, to contain the RSS values using np.empty_like() and the array you created above. The empty_like() function returns a new array with the same shape and type as a given array (in this case, a_vals).
- Write a for loop to compute the RSS for each slope value. Hint: the RSS is given by np.sum((y_data - a * x_data - b)**2). The variable b you computed in the last exercise is already in your namespace. Here, fertility is the y_data and illiteracy the x_data.
- Plot the RSS (rss) versus slope (a_vals).
# Specify slopes to consider: a_vals
a_vals = np.linspace(0, 0.1, 200)
# Initialize sum of square of residuals: rss
rss = np.empty_like(a_vals)
# Compute sum of square of residuals for each value of a_vals
for i, a in enumerate(a_vals):
    rss[i] = np.sum((fertility - a*illiteracy - b)**2)
# Plot the RSS
plt.plot(a_vals, rss, '-')
plt.xlabel('slope (children per woman / percent illiterate)')
plt.ylabel('sum of square of residuals')
plt.show()
Notice that the minimum on the plot, that is the value of the slope that gives the minimum sum of the square of the residuals, is the same value you got when performing the regression.
- The first data set (x_0, y_0) might be well modeled with a line, and the regression parameters will be meaningful.
- A line also describes most of (x_2, y_2), but the outlier throws off the slope and intercept.
- The data in (x_3, y_3) might also have a linear relationship between x and y, but from the plot you can conclude that you should try to acquire more data for intermediate x values to make sure that it does.
- The data set (x_1, y_1) is definitely not linear, and you need to choose another model.
anscombe
for i in range(1, 5):
    plt.subplot(2, 2, i)
    g = sns.scatterplot(x=f'x_{i-1}', y=f'y_{i-1}', data=anscombe)
    # x & y mean
    x_mean = anscombe.loc[:, f'x_{i-1}'].mean()
    y_mean = anscombe.loc[:, f'y_{i-1}'].mean()
    plt.vlines(x_mean, 0, 14, color='g')
    plt.text(9.2, 4, f'x-mean: {x_mean}')
    plt.hlines(y_mean, 2, 20, color='g')
    plt.text(13, 6.9, f'y-mean: {y_mean:0.1f}')
    # regression line
    slope, intercept = np.polyfit(anscombe[f'x_{i-1}'], anscombe[f'y_{i-1}'], 1)
    y1 = slope*2 + intercept
    y2 = slope*20 + intercept
    lc = mc.LineCollection([[(2, y1), (20, y2)]], color='green')
    g.add_collection(lc)
    plt.text(15, 12, 'reg-line')
    # sum of the squares of the residuals
    A = np.vstack((anscombe.loc[:, f'x_{i-1}'], np.ones(11))).T
    resid = np.linalg.lstsq(A, anscombe.loc[:, f'y_{i-1}'], rcond=-1)[1][0]
    plt.text(12.3, 1, f'sum sq. resid: {resid:0.1f}')
    plt.ylim(0, 14)
    plt.xlim(2, 20)
    plt.xticks(list(range(4, 22, 2)))
    plt.ylabel('y')
    plt.xlabel('x')
Why should exploratory data analysis be the first step in an analysis of data (after getting your data imported and cleaned, of course)?
Answer the question
For practice, perform a linear regression on the data set from Anscombe's quartet that is most reasonably interpreted with linear regression.
Instructions
- Compute the parameters by performing a linear regression with np.polyfit() on the x and y data for the first Anscombe data set. The Anscombe data are stored in the arrays x and y.
- Print the slope a and intercept b.
- Generate theoretical x and y data from the linear regression. Your x array, which you can create with np.array(), should consist of 3 and 15. To generate the $y$ data, multiply the slope by x_theor and add the intercept.
- Plot the Anscombe data as a scatter plot and then plot the theoretical line. Remember to include the marker='.' and linestyle='none' keyword arguments in addition to x and y when you plot the Anscombe data as a scatter plot. You do not need these arguments when plotting the theoretical line.
x = anscombe.x_0
y = anscombe.y_0
# Perform linear regression: a, b
a, b = np.polyfit(x, y, 1)
# Print the slope and intercept
print(a, b)
# Generate theoretical x and y data: x_theor, y_theor
x_theor = np.array([3, 15])
y_theor = a * x_theor + b
# Plot the Anscombe data and theoretical line
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.plot(x_theor, y_theor)
# Label the axes
plt.xlabel('x')
plt.ylabel('y')
# Show the plot
plt.show()
Now, to verify that all four of the Anscombe data sets have the same slope and intercept from a linear regression, you will compute the slope and intercept for each set. The data are stored in lists; anscombe_x = [x1, x2, x3, x4]
and anscombe_y = [y1, y2, y3, y4]
, where, for example, x2
and y2
are the $x$ and $y$ values for the second Anscombe data set.
Instructions
- Write a for loop that computes and prints the slope and intercept for each Anscombe data set.
# Iterate through x,y pairs
for i in range(0, 4):
    # Compute the slope and intercept: a, b
    x = anscombe.loc[:, f'x_{i}']
    y = anscombe.loc[:, f'y_{i}']
    a, b = np.polyfit(x, y, 1)
    # Print the result
    print(f'x_{i} & y_{i} slope: {a:0.4f} intercept: {b:0.4f}')
To "pull yourself up by your bootstraps" is a classic idiom meaning that you achieve a difficult task by yourself with no help at all. In statistical inference, you want to know what would happen if you could repeat your data acquisition an infinite number of times. This task is impossible, but can we use only the data we actually have to get close to the same result as an infinitude of experiments? The answer is yes! The technique to do it is aptly called bootstrapping. This chapter will introduce you to this extraordinarily powerful tool.
For example, given the measured data [23.3, 27.1, 24.3, 25.7, 26.0], one possible bootstrap sample, drawn with replacement, is [27.1, 26.0, 23.3, 25.7, 23.3]. Resampling is done with np.random.choice(), which has a size argument so we can specify the sample size:
np.random.choice(list(range(1, 6)), size=5)
vel = sol['velocity of light in air (km/s)']
print(f'Mean: {vel.mean()}')
print(f'Median: {vel.median()}')
print(f'STD: {vel.std()}')
bs_sample = np.random.choice(vel, size=100)
print(f'Mean: {bs_sample.mean()}')
print(f'Median: {np.median(bs_sample)}')
print(f'STD: {np.std(bs_sample)}')
Getting tripped up over terminology is a common cause of frustration in students. Unfortunately, you often will read and hear other data scientists using different terminology for bootstrap samples and replicates. This is even more reason why we need everything to be clear and consistent for this course. So, before going forward discussing bootstrapping, let's get our terminology down. If we have a data set with $n$ repeated measurements, a bootstrap sample is an array of length $n$ that was drawn from the original data with replacement. What is a bootstrap replicate?
Answer the question
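A bootstrap replicate is a single value of a statistic computed from a bootstrap sample (this mirrors the definition of a permutation replicate later in the course). As a minimal sketch, assuming data is a one-dimensional NumPy array:
# Bootstrap sample: resample the data with replacement
bs_sample = np.random.choice(data, size=len(data))
# Bootstrap replicate: one summary statistic computed from that sample
bs_replicate = np.mean(bs_sample)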
To help you gain intuition about how bootstrapping works, imagine you have a data set that has only three points, [-1, 0, 1]
. How many unique bootstrap samples can be drawn (e.g., [-1, 0, 1]
and [1, 0, -1]
are unique), and what is the maximum mean you can get from a bootstrap sample? It might be useful to jot down the samples on a piece of paper.
(These are too few data to get meaningful results from bootstrap procedures, but this example is useful for intuition.)
Instructions
With n = 3 data points, there are n**n = 27 possible ordered bootstrap samples. One of them, [1, 1, 1], has the maximum possible mean of 1. Conversely, 7 of them have a mean of zero.
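You can verify this count by brute force; a minimal sketch using itertools (the name samples is illustrative, not from the course):
import itertools
# All ordered draws with replacement from the three data points
samples = list(itertools.product([-1, 0, 1], repeat=3))
print(len(samples))                           # 27 bootstrap samples
print(max(np.mean(s) for s in samples))       # maximum mean: 1.0, from (1, 1, 1)
print(sum(np.mean(s) == 0 for s in samples))  # 7 samples have a mean of zero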
In this exercise, you will generate bootstrap samples from the set of annual rainfall data measured at the Sheffield Weather Station in the UK from 1883 to 2015. The data are stored in the NumPy array rainfall
in units of millimeters (mm). By graphically displaying the bootstrap samples with an ECDF, you can get a feel for how bootstrap sampling allows probabilistic descriptions of data.
Instructions
- Write a for loop to acquire 50 bootstrap samples of the rainfall data and plot their ECDFs.
- Use np.random.choice() to generate a bootstrap sample from the NumPy array rainfall. Be sure that the size of the resampled array is len(rainfall).
- Use the function ecdf() that you wrote in the prequel to this course to generate the x and y values for the ECDF of the bootstrap sample bs_sample.
- Plot the ECDF values. Specify color='gray' (to make gray dots) and alpha=0.1 (to make them semi-transparent, since we are overlaying so many) in addition to the marker='.' and linestyle='none' keyword arguments.
- Use ecdf() to generate x and y values for the ECDF of the original data available in the array rainfall, and plot them as well.
rainfall = sheffield.groupby('yyyy').agg({'rain': 'sum'})['rain'].tolist()
for _ in range(50):
    # Generate bootstrap sample: bs_sample
    bs_sample = np.random.choice(rainfall, size=len(rainfall))
    # Compute and plot ECDF from bootstrap sample
    x, y = ecdf(bs_sample)
    _ = plt.plot(x, y, marker='.', linestyle='none',
                 color='gray', alpha=0.1)
# Compute and plot ECDF from original data
x, y = ecdf(rainfall)
_ = plt.plot(x, y, marker='.', label='real data')
# Make margins and label axes
plt.margins(0.02)
_ = plt.xlabel('yearly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.legend()
# Show the plot
plt.show()
Notice how the bootstrap samples give an idea of how the distribution of rainfalls is spread.
- We'll call the function bootstrap_replicate_1d, since it works on one-dimensional arrays.
- The func argument can be any summary-statistic function, np.mean or np.median for example.
def bootstrap_replicate_1d
def bootstrap_replicate_1d(data, func):
    """Generate bootstrap replicate of 1D data."""
    bs_sample = np.random.choice(data, len(data))
    return func(bs_sample)
bootstrap_replicate_1d(sol['velocity of light in air (km/s)'], np.mean)
bootstrap_replicate_1d(sol['velocity of light in air (km/s)'], np.mean)
- Use np.empty to create the empty array of replicates.
- Use a for loop to generate a replicate and store it in the bs_replicates array.
- density=True sets the height of the bars of the histogram such that the total area of the bars is equal to one.
- Use np.percentile to compute the 2.5th and 97.5th percentiles to get the 95% confidence interval.
bs_replicates = np.empty(10000)
for i in range(10000):
    bs_replicates[i] = bootstrap_replicate_1d(sol['velocity of light in air (km/s)'], np.mean)
conf_int = np.percentile(bs_replicates, [2.5, 97.5])
print(conf_int)
n, bins, _ = plt.hist(bs_replicates, bins=30, density=True)
plt.fill_betweenx([0, n.max()], conf_int[0], conf_int[1], alpha=0.3)
plt.xlabel('mean speed of light (km/s)')
plt.ylabel('PDF')
plt.show()
The function bootstrap_replicate_1d()
from the video is available in your namespace. Now you'll write another function, draw_bs_reps(data, func, size=1)
, which generates many bootstrap replicates from the data set. This function will come in handy for you again and again as you compute confidence intervals and later when you do hypothesis tests.
Instructions
- Define a function with call signature draw_bs_reps(data, func, size=1).
- Using np.empty(), initialize an array called bs_replicates of size size to hold all of the bootstrap replicates.
- Write a for loop that ranges over size and computes a replicate using bootstrap_replicate_1d(). Refer to the exercise description above to see the function signature of bootstrap_replicate_1d(). Store the replicate in the appropriate index of bs_replicates.
- Return the array of replicates bs_replicates. This has already been done for you.
def draw_bs_reps
def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates."""
    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)
    return bs_replicates
In this exercise, you will compute a bootstrap estimate of the probability density function of the mean annual rainfall at the Sheffield Weather Station. Remember, we are estimating the mean annual rainfall we would get if the Sheffield Weather Station could repeat all of the measurements from 1883 to 2015 over and over again. This is a probabilistic estimate of the mean. You will plot the PDF as a histogram, and you will see that it is Normal.
In fact, it can be shown theoretically that under not-too-restrictive conditions, the value of the mean will always be Normally distributed. (This does not hold in general, just for the mean and a few other statistics.) The standard deviation of this distribution, called the standard error of the mean, or SEM, is given by the standard deviation of the data divided by the square root of the number of data points. I.e., for a data set, sem = np.std(data) / np.sqrt(len(data))
. Using hacker statistics, you get this same result without the need to derive it, but you will verify this result from your bootstrap replicates.
The dataset has been pre-loaded for you into an array called rainfall
.
Instructions
- Draw 10000 bootstrap replicates of the mean annual rainfall using your draw_bs_reps() function and the rainfall array. Hint: pass in np.mean for func to compute the mean. As a reminder, draw_bs_reps() accepts 3 arguments: data, func, and size.
- Compute and print the standard error of the mean of rainfall. The formula is np.std(data) / np.sqrt(len(data)).
- Compute and print the standard deviation of your bootstrap replicates bs_replicates.
- Make a histogram of the replicates using the density=True keyword argument and 50 bins.
# Take 10,000 bootstrap replicates of the mean: bs_replicates
bs_replicates = draw_bs_reps(rainfall, np.mean, 10000)
# Compute and print SEM
sem = np.std(rainfall) / np.sqrt(len(rainfall))
print(sem)
# Compute and print standard deviation of bootstrap replicates
bs_std = np.std(bs_replicates)
print(bs_std)
# Make a histogram of the results
_ = plt.hist(bs_replicates, bins=50, density=True)
_ = plt.xlabel('mean annual rainfall (mm)')
_ = plt.ylabel('PDF')
# Show the plot
plt.show()
Notice that the SEM we got from the known expression and from the bootstrap replicates is nearly the same, and the distribution of the bootstrap replicates of the mean is Normal.
A confidence interval gives upper and lower bounds on the range of parameter values you might expect to get if you repeat your measurements. For named distributions, you can compute them analytically or look them up, but one of the many beautiful properties of the bootstrap method is that you can take percentiles of your bootstrap replicates to get your confidence interval. Conveniently, you can use the np.percentile() function.
Use the bootstrap replicates you just generated to compute the 95% confidence interval. That is, give the 2.5th and 97.5th percentile of your bootstrap replicates stored as bs_replicates
. What is the 95% confidence interval?
conf_int = np.percentile(bs_replicates, [2.5, 97.5])
print(conf_int)
We saw in a previous exercise that the mean is Normally distributed. This does not necessarily hold for other statistics, but no worry: as hackers, we can always take bootstrap replicates! In this exercise, you'll generate bootstrap replicates for the variance of the annual rainfall at the Sheffield Weather Station and plot the histogram of the replicates.
Here, you will make use of the draw_bs_reps()
function you defined a few exercises ago. It is provided below for your reference:
def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates."""
    # Initialize array of replicates
    bs_replicates = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)
    return bs_replicates
Instructions
- Generate 10000 bootstrap replicates of the variance in annual rainfall, stored in the rainfall dataset, using your draw_bs_reps() function. Hint: pass in np.var for computing the variance.
- Divide your variance replicates (bs_replicates) by 100 to put the variance in units of square centimeters for convenience.
- Make a histogram of bs_replicates using the density=True keyword argument and 50 bins.
# Generate 10,000 bootstrap replicates of the variance: bs_replicates
bs_replicates = draw_bs_reps(rainfall, np.var, 10000)
# Put the variance in units of square centimeters
bs_replicates = bs_replicates/100
conf_int = np.percentile(bs_replicates, [2.5, 97.5])
print(conf_int)
# Make a histogram of the results
n, bins, _ = plt.hist(bs_replicates, bins=50, density=True)
plt.fill_betweenx([0, n.max()], conf_int[0], conf_int[1], alpha=0.3)
plt.xlabel('variance of annual rainfall (sq. cm)')
plt.ylabel('PDF')
# Show the plot
plt.show()
This is not normally distributed, as it has a longer tail to the right. Note that you can also compute a confidence interval on the variance, or any other statistic, using np.percentile()
with your bootstrap replicates.
Consider again the inter-no-hitter intervals for the modern era of baseball. Generate 10,000 bootstrap replicates of the optimal parameter $τ$. Plot a histogram of your replicates and report a 95% confidence interval.
Instructions
- Generate 10000 bootstrap replicates of $\tau$ from the nohitter_times data using your draw_bs_reps() function. Recall that the optimal $\tau$ is calculated as the mean of the data.
- Compute the 95% confidence interval using np.percentile() and passing in two arguments: the array bs_replicates and the list of percentiles, in this case 2.5 and 97.5.
- Print the confidence interval and plot a histogram of your replicates.
# Draw bootstrap replicates of the mean no-hitter time (equal to tau): bs_replicates
bs_replicates = draw_bs_reps(nohitter_times.nohitter_time, np.mean, 10000)
# Compute the 95% confidence interval: conf_int
conf_int = np.percentile(bs_replicates, [2.5, 97.5])
# Print the confidence interval
print('95% confidence interval =', conf_int, 'games')
# Plot the histogram of the replicates
n, bins, _ = plt.hist(bs_replicates, bins=50, density=True)
plt.fill_betweenx([0, n.max()], conf_int[0], conf_int[1], alpha=0.3)
plt.xlabel(r'$\tau$ (games)')
plt.ylabel('PDF')
# Show the plot
plt.show()
This gives you an estimate of what the typical time between no-hitters is. It could be anywhere between 660 and 870 games.
- Since np.random.choice must sample a one-dimensional array, we will sample the indices of the data points rather than the x, y pairs themselves.
- Use the np.arange function, which gives an array of sequential integers.
- Use np.polyfit on the pairs bootstrap sample to get a bootstrap replicate of the slope and intercept.
- Compute confidence intervals from the replicates with np.percentile.
np.arange(7)
# random.choice by indices
inds = np.arange(len(swing.total_votes))
bs_inds = np.random.choice(inds, len(inds))
bs_total_votes = swing.total_votes[bs_inds].tolist()
bs_dem_share = swing.dem_share[bs_inds].tolist()
bs_slope, bs_intercept = np.polyfit(bs_total_votes, bs_dem_share, 1)
print(bs_slope, bs_intercept)
# given the data in a pandas dataframe
bs_df_sample = swing[['total_votes', 'dem_share']].sample(len(swing.index), replace=True).copy()
np.polyfit(bs_df_sample.total_votes, bs_df_sample.dem_share, 1)
# linear regression of original data
np.polyfit(swing.total_votes, swing.dem_share, 1)
def bs_df_poly_1
def bs_df_poly_1(df: pd.DataFrame, size: int, x1: int, x2: int) -> tuple:
    """
    Calculate a number of bootstrap samples determined by size.
    For each bootstrap sample, calculate the slope & intercept.
    For each slope & intercept, calculate y1 & y2 for a line segment from x1 to x2.
    The returned list of line coordinates can be plotted with
    matplotlib.collections.LineCollection.
    df: two-column dataframe with x & y
    size: number of bootstrap samples
    x1: x-coordinate of the first line endpoint
    x2: x-coordinate of the second line endpoint
    Returns a list of slope-intercept pairs and a list of line coordinates
    [[(x1, y1), (x2, y2)], ...].
    """
    bs_df_samples = [df.sample(len(df), replace=True) for _ in range(size)]
    poly_samples = [np.polyfit(v.iloc[:, 0], v.iloc[:, 1], 1) for v in bs_df_samples]
    collections = [[(x1, v[0]*x1 + v[1]), (x2, v[0]*x2 + v[1])] for v in poly_samples]
    return poly_samples, collections
g = sns.scatterplot(x=swing.total_votes, y=swing.dem_share)
polys, lines = bs_df_poly_1(swing[['total_votes', 'dem_share']], 100, 0, 900000)
# lines can be plotted with slope and intercept
x = np.array([0, 900000])
for poly in polys:
    plt.plot(x, poly[0]*x + poly[1], linewidth=0.5, alpha=0.2, color='green')
# or lines can be plotted with a collection of lines
# lc = mc.LineCollection(lines, color='green', alpha=0.2)
# g.add_collection(lc)
plt.title('2008 US Swing State Election Results\nwith Linear Regression lines for bootstrap samples')
plt.xlabel('Total Votes')
plt.ylabel('Percent of Vote for Obama')
plt.show()
As discussed in the video, pairs bootstrap involves resampling pairs of data. Each collection of pairs is fit with a line, in this case using np.polyfit(). We do this again and again, getting bootstrap replicates of the parameter values. To have a useful tool for doing pairs bootstrap, you will write a function to perform pairs bootstrap on a set of x,y data.
Instructions
- Define a function with call signature draw_bs_pairs_linreg(x, y, size=1) to perform pairs bootstrap estimates on linear regression parameters.
- Use np.arange() to set up an array of indices going from 0 to len(x). These are what you will resample and use to pick values out of the x and y arrays.
- Use np.empty() to initialize the slope and intercept replicate arrays to be of size size.
- Write a for loop to: resample the indices inds using np.random.choice(); make new $x$ and $y$ arrays bs_x and bs_y using the resampled indices bs_inds (to do this, slice x and y with bs_inds); use np.polyfit() on the new $x$ and $y$ arrays and store the computed slope and intercept.
def draw_bs_pairs_linreg
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression."""
    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))
    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)
    return bs_slope_reps, bs_intercept_reps
Using the function you just wrote, perform pairs bootstrap to plot a histogram describing the estimate of the slope from the illiteracy/fertility data. Also report the 95% confidence interval of the slope. The data is available to you in the NumPy arrays illiteracy
and fertility
.
As a reminder, draw_bs_pairs_linreg()
has a function signature of draw_bs_pairs_linreg(x, y, size=1)
, and it returns two values: bs_slope_reps
and bs_intercept_reps
.
Instructions
- Use your draw_bs_pairs_linreg() function to take 1000 bootstrap replicates of the slope and intercept. The x-axis data is illiteracy and the y-axis data is fertility.
- Compute and print the 95% confidence interval of the slope, and plot a histogram of the slope replicates.
# Generate replicates of slope and intercept using pairs bootstrap
bs_slope_reps, bs_intercept_reps = draw_bs_pairs_linreg(illiteracy, fertility, 1000)
# Compute and print 95% CI for slope
conf_int = np.percentile(bs_slope_reps, [2.5, 97.5])
print(conf_int)
# Plot the histogram
n, bins, _ = plt.hist(bs_slope_reps, bins=50, density=True)
plt.fill_betweenx([0, n.max()], conf_int[0], conf_int[1], alpha=0.3)
plt.xlabel('slope')
plt.ylabel('PDF')
plt.show()
A nice way to visualize the variability we might expect in a linear regression is to plot the line you would get from each bootstrap replicate of the slope and intercept. Do this for the first 100 of your bootstrap replicates of the slope and intercept (stored as bs_slope_reps
and bs_intercept_reps
).
Instructions
- Generate an array of x-values consisting of 0 and 100 for the plot of the regression lines. Use the np.array() function for this.
- Write a for loop in which you plot a regression line with a slope and intercept given by the pairs bootstrap replicates. Do this for 100 lines.
- When plotting the regression lines in each iteration of the for loop, recall the regression equation y = a*x + b. Here, a is bs_slope_reps[i] and b is bs_intercept_reps[i].
- Specify the keyword arguments linewidth=0.5, alpha=0.2, and color='red' in your call to plt.plot(). (The code below uses color='green' instead.)
- Make a scatter plot with illiteracy on the x-axis and fertility on the y-axis. Remember to specify the marker='.' and linestyle='none' keyword arguments.
# Generate array of x-values for bootstrap lines: x
x = np.array([0, 100])
# Plot the bootstrap lines
for i in range(0, 100):
    _ = plt.plot(x, bs_slope_reps[i]*x + bs_intercept_reps[i], linewidth=0.5, alpha=0.2, color='green')
# Plot the data
_ = plt.plot(illiteracy, fertility, marker='.', linestyle='none')
# Label axes, set the margins, and show the plot
_ = plt.xlabel('illiteracy')
_ = plt.ylabel('fertility')
plt.margins(0.02)
plt.show()
You now know how to define and estimate parameters given a model. But the question remains: how reasonable is it to observe your data if a model is true? This question is addressed by hypothesis tests. They are the icing on the inference cake. After completing this chapter, you will be able to carefully construct and test hypotheses using hacker statistics.
swing.head()
pa_dem_share = swing.dem_share[swing.state == 'PA']
oh_dem_share = swing.dem_share[swing.state == 'OH']
pa_x, pa_y = ecdf(pa_dem_share)
oh_x, oh_y = ecdf(oh_dem_share)
plt.plot(pa_x, pa_y, marker='.', linestyle='none', label='PA')
plt.plot(oh_x, oh_y, marker='.', linestyle='none', label='OH')
plt.xlabel('Percent of vote for Obama')
plt.ylabel('ECDF')
plt.legend()
plt.show()
df = pd.DataFrame({'PA': pa_dem_share.reset_index(drop=True), 'OH': oh_dem_share.reset_index(drop=True)})
df.head()
pa_oh_stats = df.agg(['mean', 'median', 'std'])
pa_oh_stats['PA - OH'] = pa_oh_stats.PA - pa_oh_stats.OH
pa_oh_stats
Use np.random.permutation to permute the entries of the concatenated array.
dem_share_both = np.concatenate((df.PA.dropna(), df.OH.dropna()))
dem_share_both = np.random.permutation(dem_share_both)
perm_sample_pa = dem_share_both[:len(df.PA.dropna())]
perm_sample_oh = dem_share_both[len(df.PA.dropna()):]
In the video, you learned that permutation sampling is a great way to simulate the hypothesis that two variables have identical probability distributions. This is often a hypothesis you want to test, so in this exercise, you will write a function to generate a permutation sample from two data sets.
Remember, a permutation sample of two arrays having respectively n1
and n2
entries is constructed by concatenating the arrays together, scrambling the contents of the concatenated array, and then taking the first n1
entries as the permutation sample of the first array and the last n2
entries as the permutation sample of the second array.
Instructions
- Concatenate the two input arrays into one using np.concatenate(). Be sure to pass in data1 and data2 as one argument (data1, data2).
- Use np.random.permutation() to permute the concatenated array.
- Store the first len(data1) entries of permuted_data as perm_sample_1 and the last len(data2) entries of permuted_data as perm_sample_2. In practice, this can be achieved by using :len(data1) and len(data1): to slice permuted_data.
- Return perm_sample_1 and perm_sample_2.
def permutation_sample
def permutation_sample(data1, data2, seed=None):
    """Generate a permutation sample from two data sets."""
    # Concatenate the data sets: data
    data = np.concatenate((data1, data2))
    # Seed only when requested; np.random.seed(None) would re-seed from OS entropy
    if seed is not None:
        np.random.seed(seed)
    # Permute the concatenated array: permuted_data
    permuted_data = np.random.permutation(data)
    # Split the permuted array into two: perm_sample_1, perm_sample_2
    perm_sample_1 = permuted_data[:len(data1)]
    perm_sample_2 = permuted_data[len(data1):]
    return perm_sample_1, perm_sample_2
To help see how permutation sampling works, in this exercise you will generate permutation samples and look at them graphically.
We will use the Sheffield Weather Station data again, this time considering the monthly rainfall in June (a dry month) and November (a wet month). We expect these might be differently distributed, so we will take permutation samples to see how their ECDFs would look if they were identically distributed.
The data are stored in the Numpy arrays rain_june
and rain_november
.
As a reminder, permutation_sample()
has a function signature of permutation_sample(data_1, data_2)
with a return value of permuted_data[:len(data_1)], permuted_data[len(data_1):]
, where permuted_data = np.random.permutation(np.concatenate((data_1, data_2)))
.
Instructions
- Write a for loop to generate 50 permutation samples, compute their ECDFs, and plot them.
- Generate a permutation sample from rain_june and rain_november using your permutation_sample() function.
- Generate x and y values for an ECDF for each of the two permutation samples using your ecdf() function.
- Plot the ECDF of the first permutation sample (x_1 and y_1) as dots. Do the same for the second permutation sample (x_2 and y_2).
- Generate x and y values for ECDFs of the original rain_june and rain_november data and plot their ECDFs using respectively the keyword arguments color='red' and color='blue'.
rain_june = sheffield['rain'][sheffield.mm == 6]
rain_november = sheffield['rain'][sheffield.mm == 11]
for i in range(50):
    # Generate permutation samples
    perm_sample_1, perm_sample_2 = permutation_sample(rain_june, rain_november)
    # Compute ECDFs
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)
    # Plot ECDFs of permutation sample
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red', alpha=0.02)
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue', alpha=0.02)
# Create and plot ECDFs from original data
x_1, y_1 = ecdf(rain_june)
x_2, y_2 = ecdf(rain_november)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')
# Label axes, set margin, and show plot
plt.margins(0.02)
_ = plt.xlabel('monthly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.show()
Notice that the permutation samples' ECDFs overlap and give a purple haze. None of the ECDFs from the permutation samples overlaps with the observed data, suggesting that the hypothesis is not commensurate with the data. June and November rainfall are not identically distributed.
dem_share_pa = swing.dem_share[swing.state == 'PA']
dem_share_oh = swing.dem_share[swing.state == 'OH']
perm_sample_PA, perm_sample_OH = permutation_sample(dem_share_pa, dem_share_oh, seed=42)
print(f'For seed=42: Mean difference: {np.mean(perm_sample_PA) - np.mean(perm_sample_OH)}')
mean_diff = np.mean(dem_share_pa) - np.mean(dem_share_oh)
mean_diff
replicates = list()
for i in range(10000):
    perm_sample_PA, perm_sample_OH = permutation_sample(dem_share_pa, dem_share_oh, i)
    replicates.append(np.mean(perm_sample_PA) - np.mean(perm_sample_OH))
p = sns.histplot(replicates, bins=30, kde=True, stat='density')
plt.vlines(mean_diff, 0, 0.25, color='r', label='mean')
plt.annotate('p-value', xy=(3.5, 0.05), weight='bold', color='teal',
xytext=(4, 0.15), fontsize=15, arrowprops=dict(arrowstyle="->", color='teal'))
for rectangle in p.patches:
    if rectangle.get_x() >= mean_diff:
        rectangle.set_facecolor('teal')
width = p.patches[19].get_x() - mean_diff # edge of the rectangle following the mean
height = p.patches[18].get_height()
plt.bar(x=mean_diff+0.5*width, height=height, width=width, color='#99cccc')
ticks, labels = plt.xticks()
plt.legend()
plt.ylabel('PDF')
plt.xlabel('PA - OH mean percent replicate vote difference')
plt.show()
When performing hypothesis tests, your choice of test statistic should be:
Answer the question
Something that is pertinent to the question you are seeking to answer in your hypothesis test.
The p-value is generally a measure of:
Answer the question
The probability of observing a test statistic equally or more extreme than the one you observed, given that the null hypothesis is true.
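In code, a simulated p-value is simply the fraction of replicates, generated under the null hypothesis, that are at least as extreme as the observed statistic. A minimal sketch (perm_replicates and observed_stat stand in for the arrays computed later in this chapter):
# Fraction of null replicates at least as extreme as what was observed
p_value = np.sum(perm_replicates >= observed_stat) / len(perm_replicates)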
As discussed in the video, a permutation replicate is a single value of a statistic computed from a permutation sample. Just as the draw_bs_reps() function you wrote in chapter 2 is useful for generating bootstrap replicates, it is useful to have a similar function, draw_perm_reps(), to generate permutation replicates. You will write this useful function in this exercise.
The function has call signature draw_perm_reps(data_1, data_2, func, size=1)
. Importantly, func
must be a function that takes two arrays as arguments. In most circumstances, func
will be a function you write yourself.
Instructions
- Define a function with call signature draw_perm_reps(data_1, data_2, func, size=1).
- Initialize an array to hold the permutation replicates using np.empty().
- Write a for loop to: generate a permutation sample using your permutation_sample() function, then pass the samples into func() to compute the replicate and store the result in your array of replicates.
- Return the array of replicates.
def draw_perm_reps
def draw_perm_reps(data_1, data_2, func, size=1, seed=None):
    """Generate multiple permutation replicates."""
    # Seed once, not per replicate; re-using one seed inside the loop would make every replicate identical
    if seed is not None:
        np.random.seed(seed)
    # Initialize array of replicates: perm_replicates
    perm_replicates = np.empty(size)
    for i in range(size):
        # Generate permutation sample
        perm_sample_1, perm_sample_2 = permutation_sample(data_1, data_2)
        # Compute the test statistic
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates
Kleinteich and Gorb (Sci. Rep., 4, 5225, 2014) performed an interesting experiment with South American horned frogs. They held a plate connected to a force transducer, along with a bait fly, in front of the frogs. They then measured the impact force and adhesive force of the frog's tongue when it struck the target.
Frog A is an adult and Frog B is a juvenile. The researchers measured the impact force of 20 strikes for each frog. In the next exercise, we will test the hypothesis that the two frogs have the same distribution of impact forces. But, remember, it is important to do EDA first! Let's make a bee swarm plot for the data. They are stored in a Pandas data frame, df
, where column ID
is the identity of the frog and column impact_force
is the impact force in Newtons (N).
Instructions
- Use sns.swarmplot() to make a bee swarm plot of the data by specifying the x, y, and data keyword arguments.
- Label your axes and show the plot.
# Make bee swarm plot
sns.swarmplot(x='ID', y='impact force (mN)', data=frogs)
# Label axes
plt.xlabel('frog')
plt.ylabel('impact force (mN)')
# Show the plot
plt.show()
Eyeballing it, it does not look like they come from the same distribution. Frog II, the adult, has three or four very hard strikes, and Frog IV, the juvenile, has a couple of weak ones. However, it is possible that with only 20 samples it might be too difficult to tell if they have different distributions, so we should proceed with the hypothesis test.
The average strike force of Frog A was 0.71 Newtons (N), and that of Frog B was 0.42 N for a difference of 0.29 N. It is possible the frogs strike with the same force and this observed difference was by chance. You will compute the probability of getting at least a 0.29 N difference in mean strike force under the hypothesis that the distributions of strike forces for the two frogs are identical. We use a permutation test with a test statistic of the difference of means to test this hypothesis.
For your convenience, the data has been stored in the arrays force_a
and force_b
.
Instructions
- Write a function, diff_of_means(data_1, data_2), that returns the difference in means between two data sets: mean of data_1 minus mean of data_2.
- Use it to compute the observed difference of means, draw 10,000 permutation replicates, and compute the p-value.
def diff_of_means
def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""
    # The difference of means of data_1, data_2: diff
    diff = np.mean(data_1) - np.mean(data_2)
    return diff
force_a = frogs['impact force (mN)'][frogs.ID == 'II'].div(1000).to_list()
force_b = frogs['impact force (mN)'][frogs.ID == 'IV'].div(1000).to_list()
# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(force_a, force_b)
# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(force_a, force_b, diff_of_means, size=10000)
# Compute p-value: p
p = np.sum(perm_replicates >= empirical_diff_means) / len(perm_replicates)
# Print the result
print(f'p-value = {p}')
The p-value tells you that there is about a 0.6% chance that you would get the difference of means observed in the experiment if frogs were exactly the same. A p-value below 0.01 is typically said to be "statistically significant," but: warning! warning! warning! You have computed a p-value; it is a number. I encourage you not to distill it to a yes-or-no phrase. p = 0.006 and p = 0.000000006 are both said to be "statistically significant," but they are definitely not the same!
- Shift Michelson's data so that it has Newcomb's mean, simulating the null hypothesis that the true mean is Newcomb's value.
- Define a function, diff_from_newcomb, to compute the test statistic (the difference of the sample mean from Newcomb's value), and compute the observed test statistic.
- Use the draw_bs_reps function to generate the bootstrap replicates; each replicate is the value of the test statistic computed from a bootstrap sample.
newcomb_value = 299860  # km/s
michelson_speed_of_light = sol['velocity of light in air (km/s)']
michelson_mean = np.mean(michelson_speed_of_light)
michelson_shifted = michelson_speed_of_light - michelson_mean + newcomb_value
m_x, m_y = ecdf(michelson_speed_of_light)
m_x_shifted, m_y_shifted = ecdf(michelson_shifted)
sns.scatterplot(x=m_x, y=m_y, label="Michelson's Data")
sns.scatterplot(x=m_x_shifted, y=m_y_shifted, label="Michelson's Data shifted to Newcomb's Mean", color='g')
plt.vlines(michelson_mean, 0, 1, colors='b', linestyles='--', label=f"Michelson's Mean: {michelson_mean}")
plt.vlines(newcomb_value, 0, 1, colors='g', linestyles='--', label=f"Newcomb's Mean: {newcomb_value}")
plt.legend()
plt.ylabel('CDF')
plt.xlabel('Speed of Light (km/s)')
plt.show()
def diff_from_newcomb
# calculate the test statistic
def diff_from_newcomb(data: list, newcomb_value: int = 299860) -> float:
    return np.mean(data) - newcomb_value
diff_obs = diff_from_newcomb(michelson_speed_of_light)
diff_obs
# compute the p-value
bs_replicates = draw_bs_reps(michelson_shifted, diff_from_newcomb, 10000)
p_value = np.sum(bs_replicates <= diff_obs) / 10000
print(f"p-value: {p_value}. Only {p_value*100:0.02f}% of the replicates are <= to Newcomb's mean.")
Another juvenile frog was studied, Frog C, and you want to see if Frog B and Frog C have similar impact forces. Unfortunately, you do not have Frog C's impact forces available, but you know they have a mean of 0.55 N. Because you don't have the original data, you cannot do a permutation test, and you cannot assess the hypothesis that the forces from Frog B and Frog C come from the same distribution. You will therefore test another, less restrictive hypothesis: The mean strike force of Frog B is equal to that of Frog C.
To set up the bootstrap hypothesis test, you will take the mean as our test statistic. Remember, your goal is to calculate the probability of getting a mean impact force less than or equal to what was observed for Frog B if the hypothesis that the true mean of Frog B's impact forces is equal to that of Frog C is true. You first translate all of the data of Frog B such that the mean is 0.55 N. This involves adding the mean force of Frog C and subtracting the mean force of Frog B from each measurement of Frog B. This leaves other properties of Frog B's distribution, such as the variance, unchanged.
Instructions
- Translate the impact forces of Frog B such that their mean is 0.55 N.
- Use your draw_bs_reps() function to take 10,000 bootstrap replicates of the mean of your translated forces.
- Compute the p-value by finding the fraction of your bootstrap replicates that are less than the observed mean impact force of force_b.
force_c = frogs['impact force (mN)'][frogs.ID == 'III'].div(1000).to_list()
force_c_mean = np.mean(force_c)
print(f'Frog C Mean Impact Force (N): {force_c_mean:0.02f}')
# Make an array of translated impact forces: translated_force_b
translated_force_b = np.array(force_b) - np.mean(force_b) + 0.55
# Take bootstrap replicates of Frog B's translated impact forces: bs_replicates
bs_replicates = draw_bs_reps(translated_force_b, np.mean, 10000)
# Compute fraction of replicates that are less than the observed Frog B force: p
p = np.sum(bs_replicates <= np.mean(force_b)) / 10000
# Print the p-value
print('p = ', p)
Great work! The low p-value suggests that the null hypothesis that Frog B and Frog C have the same mean impact force is false.
We now want to test the hypothesis that Frog A and Frog B have the same mean impact force, but not necessarily the same distribution; this, too, cannot be done with a permutation test.
To do the two-sample bootstrap test, we shift both arrays to have the same mean, since we are simulating the hypothesis that their means are, in fact, equal. We then draw bootstrap samples out of the shifted arrays and compute the difference in means. This constitutes a bootstrap replicate, and we generate many of them. The p-value is the fraction of replicates with a difference in means greater than or equal to what was observed.
The objects forces_concat
and empirical_diff_means
are already in your namespace.
Instructions
- Compute the mean of all forces (from forces_concat) using np.mean().
- Generate shifted data sets for both force_a and force_b such that the mean of each is the mean of the concatenated array of impact forces.
- Compute the bootstrap replicates of the mean for each of the shifted arrays, take their difference, and compute the p-value.
forces_concat = force_a + force_b
empirical_diff_means = np.mean(force_a) - np.mean(force_b)
# Compute mean of all forces: mean_force
mean_force = np.mean(forces_concat)
# Generate shifted arrays
force_a_shifted = np.array(force_a) - np.mean(force_a) + mean_force
force_b_shifted = np.array(force_b) - np.mean(force_b) + mean_force
# Compute 10,000 bootstrap replicates from shifted arrays
bs_replicates_a = draw_bs_reps(force_a_shifted, np.mean, 10000)
bs_replicates_b = draw_bs_reps(force_b_shifted, np.mean, 10000)
# Get replicates of difference of means: bs_replicates
bs_replicates = bs_replicates_a - bs_replicates_b
# Compute and print p-value: p
p = np.sum(bs_replicates >= empirical_diff_means) / 10000
print('p-value =', p)
You got a similar result as when you did the permutation test. Nonetheless, remember that it is important to carefully think about what question you want to ask. Are you only interested in the mean impact force, or in the distribution of impact forces?
As you saw from the last chapter, hypothesis testing can be a bit tricky. You need to define the null hypothesis, figure out how to simulate it, and define clearly what it means to be "more extreme" in order to compute the p-value. Like any skill, practice makes perfect, and this chapter gives you some good practice with hypothesis tests.
Here we perform an A/B test on click-through data: group A has 45 click-throughs out of 500 views, and group B has 67 out of 500. We use the difference in click-through fraction, diff_frac, as our test statistic, compute its observed value, and then use the draw_perm_reps() function you wrote in the exercises to generate 10,000 permutation replicates.
def diff_frac
def diff_frac(data_A: np.ndarray, data_B: np.ndarray) -> float:
    """Compute the difference in click-through fraction, B minus A."""
    frac_A = np.sum(data_A) / len(data_A)
    frac_B = np.sum(data_B) / len(data_B)
    return frac_B - frac_A
A_0 = np.zeros((455, 1))
A_1 = np.ones((45, 1))
clickthrough_A = np.concatenate((A_0, A_1), axis=0)
B_0 = np.zeros((433, 1))
B_1 = np.ones((67, 1))
clickthrough_B = np.concatenate((B_0, B_1), axis=0)
# Acquire 10,000 permutation replicates at once via the size argument
perm_replicates = draw_perm_reps(clickthrough_A, clickthrough_B, diff_frac, size=10000)
diff_frac_obs = diff_frac(clickthrough_A, clickthrough_B)
p_value = np.sum(perm_replicates >= diff_frac_obs) / 10000
p_value
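As a cross-check on the permutation p-value (not part of the original exercise), the same data form a 2x2 contingency table, so a classical exact test can be run alongside the hacker-stats approach. This is a sketch assuming SciPy is available; the table layout ([clicks, no-clicks] per group) and the one-sided alternative are my framing choices.
from scipy.stats import fisher_exact
# Rows: group A, group B; columns: clicked, did not click
table = [[45, 455],
         [67, 433]]
# alternative='less' asks whether A's click-through odds are lower than B's,
# matching the one-sided permutation test above
odds_ratio, p_fisher = fisher_exact(table, alternative='less')
print(f'Fisher exact one-sided p-value: {p_fisher:0.4f}')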
The Civil Rights Act of 1964 was one of the most important pieces of legislation ever passed in the USA. Excluding "present" and "abstain" votes, 153 House Democrats and 136 Republicans voted yea. However, 91 Democrats and 35 Republicans voted nay. Did party affiliation make a difference in the vote?
To answer this question, you will evaluate the hypothesis that the party of a House member has no bearing on his or her vote. You will use the fraction of Democrats voting in favor as your test statistic and evaluate the probability of observing a fraction of Democrats voting in favor at least as small as the observed fraction of 153/244. (That's right, at least as small as. In 1964, it was the Democrats who were less progressive on civil rights issues.) To do this, permute the party labels of the House voters and then arbitrarily divide them into "Democrats" and "Republicans" and compute the fraction of Democrats voting yea.
Instructions
- Construct Boolean arrays, dems and reps, that contain the votes of the respective parties; e.g., dems has 153 True entries and 91 False entries.
- Write a function, frac_yea_dems(dems, reps), that returns the fraction of Democrats that voted yea. The first input is an array of Booleans; two inputs are required to match the call signature of draw_perm_reps(), but the second is not used.
- Use your draw_perm_reps() function to draw 10,000 permutation replicates of the fraction of Democrat yea votes.
- Compute and print the p-value.
# Construct arrays of data: dems, reps
dems = np.array([True] * 153 + [False] * 91)
reps = np.array([True] * 136 + [False] * 35)
def frac_yea_dems(dems, reps):
    """Compute fraction of Democrat yea votes."""
    frac = np.sum(dems) / len(dems)
    return frac
# Acquire permutation samples: perm_replicates
perm_replicates = draw_perm_reps(dems, reps, frac_yea_dems, 10000)
# Compute and print p-value: p
p = np.sum(perm_replicates <= 153/244) / len(perm_replicates)
print('p-value =', p)
This small p-value suggests that party identity had a lot to do with the voting. Importantly, the South had a higher fraction of Democrat representatives, and consequently also a more racist bias.
You have experience matching stories to probability distributions. Similarly, you use the same procedure for two different A/B tests if their stories match. In the Civil Rights Act example you just did, you performed an A/B test on voting data, which has a Yes/No type of outcome for each subject (in that case, a voter). Which of the following situations involving testing by a web-based company has an equivalent set up for an A/B test as the one you just did with the Civil Rights Act of 1964?
Answer the question
The "Democrats" are those who view the ad before the color change, and the "Republicans" are those who view it after.
It turns out that you already did a hypothesis test analogous to an A/B test in which you are interested in how much time is spent on a website before and after an ad campaign. The frog tongue impact force (a continuous quantity, like time spent on a website) is the analog: "before" = Frog A and "after" = Frog B. Let's practice this again with something that actually is a before/after scenario.
We return to the no-hitter data set. In 1920, Major League Baseball implemented important rule changes that ended the so-called dead ball era. Importantly, the pitcher was no longer allowed to spit on or scuff the ball, an activity that greatly favors pitchers. In this problem you will perform an A/B test to determine if these rule changes resulted in a slower rate of no-hitters (i.e., a longer average time between no-hitters) using the difference in mean inter-no-hitter time as your test statistic. The inter-no-hitter times for the respective eras are stored in the arrays nht_dead and nht_live, where "nht" stands for "no-hitter time."
Since you will be using your draw_perm_reps() function in this exercise, it may be useful to remind yourself of its call signature, draw_perm_reps(d1, d2, func, size=1), or to refer back to the chapter 3 exercise in which you defined it.
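If you no longer have those chapter 3 definitions handy, here is a minimal sketch consistent with the call signature above; the permutation_sample() and diff_of_means() helpers are reconstructed from how they are used in this document, so your versions may differ in details.
def permutation_sample(data1, data2):
    """Concatenate two data sets, permute them, and split them back apart."""
    permuted = np.random.permutation(np.concatenate((data1, data2)))
    return permuted[:len(data1)], permuted[len(data1):]

def draw_perm_reps(d1, d2, func, size=1):
    """Generate multiple permutation replicates of func(d1, d2)."""
    perm_replicates = np.empty(size)
    for i in range(size):
        perm_sample_1, perm_sample_2 = permutation_sample(d1, d2)
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates

def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""
    return np.mean(data_1) - np.mean(data_2)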
Instructions
- Compute the observed difference in mean inter-no-hitter time using diff_of_means().
- Acquire 10,000 permutation replicates of the difference in mean inter-no-hitter time using draw_perm_reps().
- Compute and print the p-value.
nht_dead = baseball_df.games_since_last_nht[baseball_df.date.dt.year < 1920].to_numpy()
nht_live = baseball_df.games_since_last_nht[baseball_df.date.dt.year >= 1920].to_numpy()
# Compute the observed difference in mean inter-no-hitter times: nht_diff_obs
nht_diff_obs = diff_of_means(nht_dead, nht_live)
# Acquire 10,000 permutation replicates of difference in mean no-hitter time: perm_replicates
perm_replicates = draw_perm_reps(nht_dead, nht_live, diff_of_means, 10000)
# Compute and print the p-value: p
p = np.sum(perm_replicates <= nht_diff_obs) / len(perm_replicates)
print('p-val =', p)
The p-value is 0.0001, which means that only one of your 10,000 replicates had a result as extreme as the actual difference between the dead ball and live ball eras. This suggests strong statistical significance. Watch out, though: you could very well have gotten zero replicates that were as extreme as the observed value. That would just mean the p-value is quite small, almost certainly smaller than 0.001.
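When zero replicates are as extreme as the observed value, one common convention (my suggestion, not something the course prescribes) is to report the conservative estimate (r + 1) / (N + 1), where r is the number of replicates at least as extreme and N is the total number of replicates:
# Conservative p-value estimate when few or no replicates are as extreme
r = np.sum(perm_replicates <= nht_diff_obs)
n = len(perm_replicates)
print(f'conservative p-value estimate: {(r + 1) / (n + 1):0.5f}')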
That was a nice hypothesis test you just did to check out whether the rule changes in 1920 changed the rate of no-hitters. But what should you have done with the data first?
Answer the question
Answer: perform EDA, such as plotting the ECDFs of the inter-no-hitter times for the two eras. Always a good idea to do first! I encourage you to go ahead and plot the ECDFs right now. You will see by eye that the null hypothesis that the distributions are the same is almost certainly not true.
x_dead, y_dead = ecdf(nht_dead)
x_live, y_live = ecdf(nht_live)
sns.scatterplot(x=x_dead, y=y_dead, label='dead ball: through 1919')
sns.scatterplot(x=x_live, y=y_live, label='live ball: from 1920', color='purple')
plt.vlines(np.mean(nht_dead), 0, 1, linestyles='--', label=f'dead ball mean: {np.mean(nht_dead):0.02f}', color='b')
plt.vlines(np.mean(nht_live), 0, 1, linestyles='--', label=f'live ball mean: {np.mean(nht_live):0.02f}', color='purple')
plt.ylabel('CDF')
plt.xlabel('Number of games between no-hitters')
plt.legend(loc=7)
plt.title('ECDF for Number of Games Between No-Hitters\nDead Ball Era vs. Live Ball Era')
plt.show()
As another permutation test on correlation, let's check whether the observed Pearson correlation between total votes and Democratic vote share in the 2008 swing-state data could have arisen by chance.
obama_pearson_observed = swing.corr().loc['total_votes', 'dem_share']  # via the pandas DataFrame .corr() method
g = sns.regplot(x='total_votes', y='dem_share', data=swing)
plt.xlim(0, 900000)
plt.text(400000, 30, f'ρ = {obama_pearson_observed:0.02f}', fontsize=20)
plt.ylabel('Percent of Vote for Obama')
plt.xlabel('Total Votes')
plt.title('2008 US Swing State Election Results')
plt.show()
# Initialize permutation replicates: perm_replicates
obama_perm_replicates = np.empty(10000)
# Draw replicates
for i in range(len(obama_perm_replicates)):
    # Permute dem_share measurements
    dem_share_permuted = np.random.permutation(swing.dem_share)
    # Compute Pearson correlation
    obama_perm_replicates[i] = pearson_r(dem_share_permuted, swing.total_votes)
sns.histplot(obama_perm_replicates, kde=True)
plt.axvline(obama_pearson_observed, 0, 1, label=f'observed ρ: {obama_pearson_observed:0.02f}', color='firebrick')
plt.legend()
plt.xlabel('Pearson correlation')
plt.ylabel('probability under null')
plt.title('10000 Replicates of Pearson\nShare of Total Votes for Obama')
plt.show()
The observed correlation between female illiteracy and fertility in the data set of 162 countries may just be by chance; the fertility of a given country may actually be totally independent of its illiteracy. You will test this null hypothesis in the next exercise.
To do the test, you need to simulate the data assuming the null hypothesis is true. Of the following choices, which is the best way to do it?
Answer the question
The observed correlation between female illiteracy and fertility may just be by chance; the fertility of a given country may actually be totally independent of its illiteracy. You will test this hypothesis. To do so, permute the illiteracy values but leave the fertility values fixed. This simulates the hypothesis that they are totally independent of each other. For each permutation, compute the Pearson correlation coefficient and assess how many of your permutation replicates have a Pearson correlation coefficient greater than the observed one.
The function pearson_r() that you wrote in the prequel to this course for computing the Pearson correlation coefficient is already in your namespace.
Instructions
- Compute the observed Pearson correlation between illiteracy and fertility.
- Initialize an array to store your permutation replicates.
- Write a for loop to draw 10,000 replicates:
  - Permute the illiteracy measurements using np.random.permutation().
  - Compute the Pearson correlation between illiteracy_permuted and fertility.
- Compute the p-value as the fraction of replicates with a Pearson correlation at least as large as the observed one.
illiteracy = 100 - fem['female literacy'].to_numpy()
fertility = fem.fertility.to_numpy()
# Compute observed correlation: r_obs
r_obs = pearson_r(illiteracy, fertility)
# Initialize permutation replicates: perm_replicates
perm_replicates = np.empty(10000)
# Draw replicates
for i in range(len(perm_replicates)):
    # Permute illiteracy measurements: illiteracy_permuted
    illiteracy_permuted = np.random.permutation(illiteracy)
    # Compute Pearson correlation
    perm_replicates[i] = pearson_r(illiteracy_permuted, fertility)
# Compute p-value: p
p = np.sum(perm_replicates >= r_obs) / len(perm_replicates)
print('p-val =', p)
You got a p-value of zero. In hacker statistics, this means that your p-value is very low, since you never got a single replicate in the 10,000 you took that had a Pearson correlation greater than the observed one. You could try increasing the number of replicates you take to continue to move the upper bound on your p-value lower and lower.
sns.histplot(perm_replicates, kde=True)
plt.axvline(r_obs, 0, 1, label=f'observed ρ: {r_obs:0.02f}', color='firebrick')
plt.legend()
plt.xlabel('Pearson correlation')
plt.ylabel('probability under null')
plt.title('10000 Replicates of Pearson\nIlliteracy as Correlated to Fertility')
plt.show()
As a final exercise in hypothesis testing before we put everything together in our case study in the next chapter, you will investigate the effects of neonicotinoid insecticides on bee reproduction. These insecticides are very widely used in the United States to combat aphids and other pests that damage plants.
In a recent study, Straub, et al. (Proc. Roy. Soc. B, 2016) investigated the effects of neonicotinoids on the sperm of pollinating bees. In this and the next exercise, you will study how the pesticide treatment affected the count of live sperm per half milliliter of semen.
First, we will do EDA, as usual. Plot ECDFs of the alive sperm count for untreated bees (stored in the NumPy array control) and bees treated with pesticide (stored in the NumPy array treated).
Instructions
- Use your ecdf() function to generate x, y values from the control and treated arrays for plotting the ECDFs.
- Plot the ECDFs, set the margins, add a legend, and label the axes.
# Note: the values used in the DataCamp exercise are multiplied by 2; I have not done that here.
control = bees['Alive Sperm Millions'][bees.Treatment == 'Control'].to_numpy()
treated = bees['Alive Sperm Millions'][bees.Treatment == 'Pesticide'].to_numpy()
# Compute x,y values for ECDFs
x_control, y_control = ecdf(control)
x_treated, y_treated = ecdf(treated)
# Plot the ECDFs
plt.plot(x_control, y_control, marker='.', linestyle='none', color='b', label='control ecdf')
plt.plot(x_treated, y_treated, marker='.', linestyle='none', color='g', label='treated ecdf')
plt.axvline(np.mean(control), color='b', linestyle='--', label=f'observed control mean: {np.mean(control):0.02f}')
plt.axvline(np.mean(treated), color='g', linestyle='--', label=f'observed treated mean: {np.mean(treated):0.02f}')
# Set the margins
plt.margins(0.02)
# Add a legend
plt.legend(loc=7)
# Label axes and show plot
plt.xlabel('millions of alive sperm per mL')
plt.ylabel('ECDF')
plt.show()
The ECDFs show a pretty clear difference between the treatment and control; treated bees have fewer alive sperm. Let's now do a hypothesis test in the next exercise.
Now, you will test the following hypothesis: on average, male bees treated with neonicotinoid insecticide have the same number of active sperm per milliliter of semen as do untreated male bees. You will use the difference of means as your test statistic.
For your reference, the call signature for the draw_bs_reps() function you wrote in chapter 2 is draw_bs_reps(data, func, size=1).
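And if you need the function itself, here is a minimal sketch consistent with that call signature; bootstrap_replicate_1d() is the helper name used earlier in the course, reconstructed here, so your chapter 2 version may differ slightly.
def bootstrap_replicate_1d(data, func):
    """Generate a single bootstrap replicate: resample with replacement, apply func."""
    return func(np.random.choice(data, size=len(data)))

def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates of func applied to data."""
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)
    return bs_replicates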
Instructions
- Compute the mean of control minus the mean of treated.
- Concatenate control and treated and take the mean of the concatenated array.
- Generate shifted data sets for control and treated such that the shifted data sets have the same mean.
- Generate 10,000 bootstrap replicates of the mean for each shifted array using your draw_bs_reps() function.
- Compute and print the p-value.
# Compute the difference in mean sperm count: diff_means
diff_means = diff_of_means(control, treated)
# Compute mean of pooled data: mean_count
mean_count = np.mean(np.concatenate((control, treated)))
# Generate shifted data sets
control_shifted = control - np.mean(control) + mean_count
treated_shifted = treated - np.mean(treated) + mean_count
# Generate bootstrap replicates
bs_reps_control = draw_bs_reps(control_shifted, np.mean, size=10000)
bs_reps_treated = draw_bs_reps(treated_shifted, np.mean, size=10000)
# Get replicates of difference of means: bs_replicates
bs_replicates = bs_reps_control - bs_reps_treated
# Compute and print p-value: p
p = np.sum(bs_replicates >= diff_means) / len(bs_replicates)
print('p-value =', p)
The p-value is small, most likely less than 0.0001, since you never saw a bootstrap replicate with a difference of means at least as extreme as what was observed. In fact, when I did the calculation with 10 million replicates, I got a p-value of 2e-05.
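Running 10 million replicates through the loop-based draw_bs_reps() is slow, and drawing them all at once would not fit in memory. Below is one way to vectorize the draw in memory-friendly chunks; the function name, chunk size, and use of NumPy's Generator API are my own choices, not part of the exercise.
def draw_bs_mean_reps_fast(data, size, chunk=50_000, rng=None):
    """Vectorized bootstrap replicates of the mean, drawn in memory-friendly chunks."""
    rng = np.random.default_rng() if rng is None else rng
    reps = np.empty(size)
    for start in range(0, size, chunk):
        stop = min(start + chunk, size)
        # Each row of the 2D draw is one bootstrap sample of the original length
        samples = rng.choice(data, size=(stop - start, len(data)))
        reps[start:stop] = samples.mean(axis=1)
    return reps

# e.g., 10 million replicates per group under the shifted (null) data:
# bs_c = draw_bs_mean_reps_fast(control_shifted, 10_000_000)
# bs_t = draw_bs_mean_reps_fast(treated_shifted, 10_000_000)
# p = np.sum((bs_c - bs_t) >= diff_means) / len(bs_c)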
# Compute x,y values for ECDFs
x_control, y_control = ecdf(control_shifted)
x_treated, y_treated = ecdf(treated_shifted)
# Plot the ECDFs
plt.plot(x_control, y_control, marker='.', linestyle='none', color='b', label='shifted control ecdf')
plt.plot(x_treated, y_treated, marker='.', linestyle='none', color='g', label='shifted treated ecdf')
plt.axvline(np.mean(control_shifted), color='b', linestyle='--', label=f'shifted control mean: {np.mean(control_shifted):0.02f}')
plt.axvline(np.mean(treated_shifted), color='g', linestyle='--', label=f'shifted treated mean: {np.mean(treated_shifted):0.02f}')
# Set the margins
plt.margins(0.02)
# Add a legend
plt.legend(loc=7)
# Label axes and show plot
plt.title('ECDF of Shifted Control & Treatment Means')
plt.xlabel('millions of alive sperm per mL')
plt.ylabel('ECDF')
plt.show()
sns.histplot(bs_replicates, kde=True)
plt.axvline(diff_means, linestyle='--', color='firebrick', label=f'Observed diff mean: {diff_means:0.02f}')
plt.title('Bootstrap Replicates of Difference of Means\nControl & Treated Mean Shifted to Mean of Combined Data')
plt.xlabel('Difference of Shifted Means')
plt.ylabel('Probability Under Null Hypothesis')
plt.legend()
plt.show()
Every year for the past 40-plus years, Peter and Rosemary Grant have gone to the Galápagos island of Daphne Major and collected data on Darwin's finches. Using your skills in statistical inference, you will spend this chapter with their data, and witness first hand, through data, evolution in action. It's an exhilarating way to end the course!
For your first foray into the Darwin finch data, you will study how the beak depth (the distance, top to bottom, of a closed beak) of the finch species Geospiza scandens has changed over time. The Grants have noticed some changes of beak geometry depending on the types of seeds available on the island, and they also noticed that there was some interbreeding with another major species on Daphne Major, Geospiza fortis. These effects can lead to changes in the species over time.
In the next few problems, you will look at the beak depth of G. scandens on Daphne Major in 1975 and in 2012. To start with, let's plot all of the beak depth measurements in 1975 and 2012 in a bee swarm plot.
The data are stored in a pandas DataFrame called df with columns 'year' and 'beak_depth'. The units of beak depth are millimeters (mm).
Instructions
- Create a bee swarm plot of the G. scandens beak depth measurements by year using sns.swarmplot().
- Label the axes and show the plot.
finch.species.unique()
sns.swarmplot(data=finch[finch.species == 'scandens'], x='year', y='beak_depth')
plt.xlabel('year')
plt.ylabel('beak depth (mm)')
plt.show()
It is kind of hard to see if there is a clear difference between the 1975 and 2012 data set. Eyeballing it, it appears as though the mean of the 2012 data set might be slightly higher, and it might have a bigger variance.
While bee swarm plots are useful, we found that ECDFs are often even better when doing EDA. Plot the ECDFs for the 1975 and 2012 beak depth measurements on the same plot.
For your convenience, the beak depths for the respective years have been stored in the NumPy arrays bd_1975 and bd_2012.
Instructions
- Compute the ECDFs for bd_1975 and bd_2012 using your ecdf() function.
- Plot both ECDFs on the same axes, set the margins, and add axis labels and a legend.
bd_1975 = finch_1975.beak_depth[finch_1975.species == 'scandens'].values
bd_2012 = finch_2012.beak_depth[finch_2012.species == 'scandens'].values
# Compute ECDFs
x_1975, y_1975 = ecdf(bd_1975)
x_2012, y_2012 = ecdf(bd_2012)
# Plot the ECDFs
plt.scatter(x_1975, y_1975, marker='.', s=10)
plt.scatter(x_2012, y_2012, marker='.', s=10)
# Set the margins
plt.margins(0.02)
# Add axis labels and legend
plt.xlabel('beak depth (mm)')
plt.ylabel('ECDF')
plt.legend(('1975', '2012'), loc='lower right')
# Show the plot
plt.show()
The differences are much clearer in the ECDF. The mean is larger in the 2012 data, and the variance does appear larger as well.
# seaborn 0.11+ can do the same thing from a single DataFrame, with no separate ecdf() function
sns.ecdfplot(data=finch[finch.species == 'scandens'], x='beak_depth', hue='year')
Estimate the difference of the mean beak depth of the G. scandens samples from 1975 and 2012 and report a 95% confidence interval.
Since in this exercise you will use the draw_bs_reps() function you wrote in chapter 2, it may be helpful to refer back to it.
Instructions
- Compute the difference of the sample means (2012 minus 1975): mean_diff.
- Take 10,000 bootstrap replicates of the mean for each year and compute replicates of the difference of means.
- Compute the 95% confidence interval and print the results.
# Compute the difference of the sample means: mean_diff
mean_diff = np.mean(bd_2012) - np.mean(bd_1975)
# Get bootstrap replicates of means
bs_replicates_1975 = draw_bs_reps(bd_1975, np.mean, size=10000)
bs_replicates_2012 = draw_bs_reps(bd_2012, np.mean, size=10000)
# Compute samples of difference of means: bs_diff_replicates
bs_diff_replicates = bs_replicates_2012 - bs_replicates_1975
# Compute 95% confidence interval: conf_int
conf_int = np.percentile(bs_diff_replicates, [2.5, 97.5])
# Print the results
print(f'difference of means: {mean_diff} mm')
print(f'95% confidence interval: {conf_int} mm')
Your plot of the ECDF and determination of the confidence interval make it pretty clear that the beaks of G. scandens on Daphne Major have gotten deeper. But is it possible that this effect is just due to random chance? In other words, what is the probability that we would get the observed difference in mean beak depth if the means were the same?
Be careful! The hypothesis we are testing is not that the beak depths come from the same distribution. For that we could use a permutation test. The hypothesis is that the means are equal. To perform this hypothesis test, we need to shift the two data sets so that they have the same mean and then use bootstrap sampling to compute the difference of means.
Instructions
- Compute the mean of the combined data set.
- Shift bd_1975 and bd_2012 such that their means are equal to the one you just computed for the combined data set.
- Take 10,000 bootstrap replicates of the mean for each shifted array and compute replicates of the difference of means.
- Compute the p-value as the fraction of replicates with a difference in means greater than or equal to mean_diff.
# Compute mean of combined data set: combined_mean
combined_mean = np.mean(np.concatenate((bd_1975, bd_2012)))
# Shift the samples
bd_1975_shifted = bd_1975 - np.mean(bd_1975) + combined_mean
bd_2012_shifted = bd_2012 - np.mean(bd_2012) + combined_mean
# Get bootstrap replicates of shifted data sets
bs_replicates_1975 = draw_bs_reps(bd_1975_shifted, np.mean, size=10000)
bs_replicates_2012 = draw_bs_reps(bd_2012_shifted, np.mean, size=10000)
# Compute replicates of difference of means: bs_diff_replicates
bs_diff_replicates = bs_replicates_2012 - bs_replicates_1975
# Compute the p-value
p = np.sum(bs_diff_replicates >= mean_diff) / len(bs_diff_replicates)
# Print p-value
print(f'p: {p}')
sns.ecdfplot(data=bs_replicates_1975, label='1975')
sns.ecdfplot(data=bs_replicates_2012, label='2012')
plt.legend(title='year')
plt.xlabel('beak depth (mm)')
plt.title('ECDF of Replicates with Shifted Mean')
plt.show()
The draw_bs_pairs_linreg() function you wrote will be helpful in computing confidence intervals for your linear regression parameters. The beak length data are stored as bl_1975 and bl_2012, again with units of millimeters (mm). You still have the beak depth data stored in bd_1975 and bd_2012. Make scatter plots of beak depth (y-axis) versus beak length (x-axis) for the 1975 and 2012 specimens.
Instructions
- Make the scatter plot of the 1975 data with the color='blue' keyword argument. Also use an alpha=0.5 keyword argument to have transparency in case data points overlap.
- Make the scatter plot of the 2012 data with the color='red' keyword argument.
bl_1975 = finch_1975.beak_length[finch_1975.species == 'scandens'].values
bl_2012 = finch_2012.beak_length[finch_2012.species == 'scandens'].values
# Make scatter plot of 1975 data
plt.scatter(bl_1975, bd_1975, marker='.', color='blue', alpha=0.5)
# Make scatter plot of 2012 data
plt.scatter(bl_2012, bd_2012, marker='.', color='red', alpha=0.5)
# Label axes and make legend
plt.xlabel('beak length (mm)')
plt.ylabel('beak depth (mm)')
plt.legend(('1975', '2012'), loc='upper left')
# Show the plot
plt.show()
Great work! In looking at the plot, we see that beaks got deeper (the red points are higher up in the y-direction), but not really longer. If anything, they got a bit shorter, since the red dots are to the left of the blue dots. So, it does not look like the beaks kept the same shape; they became shorter and deeper.
Perform a linear regression for both the 1975 and 2012 data. Then, perform pairs bootstrap estimates for the regression parameters. Report 95% confidence intervals on the slope and intercept of the regression line.
You will use the draw_bs_pairs_linreg()
function you wrote back in chapter 2.
As a reminder, its call signature is draw_bs_pairs_linreg(x, y, size=1), and it returns bs_slope_reps and bs_intercept_reps. The beak length data are stored as bl_1975 and bl_2012, and the beak depth data are stored in bd_1975 and bd_2012.
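For reference, here is a minimal sketch consistent with that call signature, built on np.polyfit(); your chapter 2 version may differ in details.
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for a linear regression."""
    # Indices to resample (x, y) pairs together
    inds = np.arange(len(x))
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(x[bs_inds], y[bs_inds], deg=1)
    return bs_slope_reps, bs_intercept_reps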
Instructions
- Compute the linear regression slope and intercept for each year using np.polyfit().
- Perform pairs bootstrap for the linear regressions with your draw_bs_pairs_linreg() function.
- Compute and print the 95% confidence intervals of the slopes and intercepts.
# Compute the linear regressions
slope_1975, intercept_1975 = np.polyfit(bl_1975, bd_1975, deg=1)
slope_2012, intercept_2012 = np.polyfit(bl_2012, bd_2012, deg=1)
# Perform pairs bootstrap for the linear regressions
bs_slope_reps_1975, bs_intercept_reps_1975 = draw_bs_pairs_linreg(bl_1975, bd_1975, size=1000)
bs_slope_reps_2012, bs_intercept_reps_2012 = draw_bs_pairs_linreg(bl_2012, bd_2012, size=1000)
# Compute confidence intervals of slopes and intercepts
slope_conf_int_1975 = np.percentile(bs_slope_reps_1975, [2.5, 97.5])
slope_conf_int_2012 = np.percentile(bs_slope_reps_2012, [2.5, 97.5])
intercept_conf_int_1975 = np.percentile(bs_intercept_reps_1975, [2.5, 97.5])
intercept_conf_int_2012 = np.percentile(bs_intercept_reps_2012, [2.5, 97.5])
# Print the results
print(f'1975: slope: {slope_1975:0.2f} conf int: {np.round(slope_conf_int_1975, 2)}')
print(f'1975: intercept: {intercept_1975:0.2f} conf int: {np.round(intercept_conf_int_1975, 2)}')
print(f'2012: slope: {slope_2012:0.2f} conf int: {np.round(slope_conf_int_2012, 2)}')
print(f'2012: intercept: {intercept_2012:0.2f} conf int: {np.round(intercept_conf_int_2012, 2)}')
Now, you will display your linear regression results on the scatter plot, the code for which is already pre-written for you from your previous exercise. To do this, take the first 100 bootstrap samples (stored in bs_slope_reps_1975, bs_intercept_reps_1975, bs_slope_reps_2012, and bs_intercept_reps_2012) and plot the lines with alpha=0.2 and linewidth=0.5 keyword arguments to plt.plot().
Instructions
- Generate the x-values for the bootstrap lines using np.array(). They should consist of 10 mm and 17 mm.
- Write a for loop to plot 100 of the bootstrap lines for the 1975 and 2012 data sets. The lines for the 1975 data set should be 'blue' and those for the 2012 data set should be 'red'.
# Make scatter plot of 1975 data
plt.plot(bl_1975, bd_1975, marker='.', linestyle='none', color='blue', alpha=0.5)
# Make scatter plot of 2012 data
plt.plot(bl_2012, bd_2012, marker='.', linestyle='none', color='red', alpha=0.5)
# Label axes and make legend
plt.xlabel('beak length (mm)')
plt.ylabel('beak depth (mm)')
plt.legend(('1975', '2012'), loc='upper left')
# Generate x-values for bootstrap lines: x
x = np.array([10, 17])
# Plot the bootstrap lines
for i in range(100):
    plt.plot(x, bs_slope_reps_1975[i]*x + bs_intercept_reps_1975[i], linewidth=0.5, alpha=0.2, color='blue')
    plt.plot(x, bs_slope_reps_2012[i]*x + bs_intercept_reps_2012[i], linewidth=0.5, alpha=0.2, color='red')
# Draw the plot again
plt.show()
The linear regressions showed interesting information about the beak geometry. The slope was the same in 1975 and 2012, suggesting that for every millimeter gained in beak length, the birds gained about half a millimeter in depth in both years. However, if we are interested in the shape of the beak, we want to compare the ratio of beak length to beak depth. Let's make that comparison.
Remember, the data are stored in bd_1975, bd_2012, bl_1975, and bl_2012.
Instructions
- Compute the length-to-depth ratios and their means for the 1975 and 2012 data.
- Generate 10,000 bootstrap replicates of each mean using your draw_bs_reps() function.
- Compute and print the 99% confidence intervals.
# Compute length-to-depth ratios
ratio_1975 = bl_1975 / bd_1975
ratio_2012 = bl_2012 / bd_2012
# Compute means
mean_ratio_1975 = np.mean(ratio_1975)
mean_ratio_2012 = np.mean(ratio_2012)
# Generate bootstrap replicates of the means
bs_replicates_1975 = draw_bs_reps(ratio_1975, np.mean, size=10000)
bs_replicates_2012 = draw_bs_reps(ratio_2012, np.mean, size=10000)
# Compute the 99% confidence intervals
conf_int_1975 = np.percentile(bs_replicates_1975, [0.5, 99.5])
conf_int_2012 = np.percentile(bs_replicates_2012, [0.5, 99.5])
# Print the results
print(f'1975: mean ratio: {mean_ratio_1975:0.2f} conf int: {np.round(conf_int_1975, 2)}')
print(f'2012: mean ratio: {mean_ratio_2012:0.2f} conf int: {np.round(conf_int_2012, 2)}')
In the previous exercise, you computed the mean beak length to depth ratio with 99% confidence intervals for 1975 and for 2012. The results of that calculation are shown graphically in the plot accompanying this problem. In addition to these results, what would you say about the ratio of beak length to depth?
Instructions
plt.figure(figsize=(6, 4))
sns.scatterplot(x=[mean_ratio_1975, mean_ratio_2012], y=['1975', '2012'])
plt.hlines(y=['1975', '2012'], xmin=[conf_int_1975[0], conf_int_2012[0]], xmax=[conf_int_1975[1], conf_int_2012[1]])
plt.xlabel('beak length / beak depth')
plt.show()
fortis_parent_offspr = pd.read_csv(data_paths[5]) # has separate male/female measurements
fortis_parent_offspr['mid_parent'] = fortis_parent_offspr.iloc[:, 1:].mean(axis=1)
scandens_parent_offspr = pd.read_csv(data_paths[8]) # has mean of male/female measurements
bd_parent_fortis = fortis_parent_offspr['mid_parent'].values
bd_offspring_fortis = fortis_parent_offspr['Mid-offspr'].values
bd_parent_scandens = scandens_parent_offspr['mid_parent'].values
bd_offspring_scandens = scandens_parent_offspr['mid_offspring'].values
The array bd_parent_scandens contains the average beak depth (in mm) of two parents of the species G. scandens. The array bd_offspring_scandens contains the average beak depth of the offspring of the respective parents. The arrays bd_parent_fortis and bd_offspring_fortis contain the same information about measurements from G. fortis birds.
Make a scatter plot of the average offspring beak depth (y-axis) versus average parental beak depth (x-axis) for both species. Use the alpha=0.5 keyword argument to help you see overlapping points.
Instructions
# Make scatter plots
plt.scatter(bd_parent_fortis, bd_offspring_fortis, s=10, color='blue', alpha=0.5)
plt.scatter(bd_parent_scandens, bd_offspring_scandens, s=10, color='red', alpha=0.5)
# Label axes
plt.xlabel('parental beak depth (mm)')
plt.ylabel('offspring beak depth (mm)')
# Add legend
plt.legend(('G. fortis', 'G. scandens'), loc='lower right')
# Show plot
plt.show()
It appears as though there is a stronger correlation in G. fortis than in G. scandens. This suggests that beak depth is more strongly inherited in G. fortis. We'll quantify this correlation next.
fortis_parent_offspr[['Mid-offspr', 'mid_parent']].corr()
scandens_parent_offspr.corr()
In an effort to quantify the correlation between offspring and parent beak depths, we would like to compute statistics, such as the Pearson correlation coefficient, between parents and offspring. To get confidence intervals on this, we need to do a pairs bootstrap.
You have already written a function to do pairs bootstrap to get estimates for parameters derived from linear regression. Your task in this exercise is to make a new function with call signature draw_bs_pairs(x, y, func, size=1) that performs pairs bootstrap and computes a single statistic on the pairs samples. The statistic of interest is computed by calling func(bs_x, bs_y). In the next exercise, you will use pearson_r for func.
Instructions
- Set up an array of indices to sample from, and initialize an empty array of bootstrap replicates with length size.
- Write a for loop to draw the samples:
  - Randomly choose indices with replacement.
  - Extract the x values and y values from the input arrays using the indices you just chose to generate a bootstrap sample.
  - Use func to compute the statistic of interest from the bootstrap samples of x and y and store it in your array of bootstrap replicates.
def draw_bs_pairs
def draw_bs_pairs(x, y, func, size=1):
    """Perform pairs bootstrap for a single statistic."""
    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))
    # Initialize replicates: bs_replicates
    bs_replicates = np.empty(size)
    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_replicates[i] = func(bs_x, bs_y)
    return bs_replicates
The Pearson correlation coefficient seems like a useful measure of how strongly the beak depth of parents are inherited by their offspring. Compute the Pearson correlation coefficient between parental and offspring beak depths for G. scandens. Do the same for G. fortis. Then, use the function you wrote in the last exercise to compute a 95% confidence interval using pairs bootstrap.
Remember, the data are stored in bd_parent_scandens, bd_offspring_scandens, bd_parent_fortis, and bd_offspring_fortis.
Instructions
- Use the pearson_r() function to compute the Pearson correlation coefficient for G. scandens and for G. fortis.
- Acquire 1,000 bootstrap replicates of each using the draw_bs_pairs() function you wrote in the previous exercise.
- Compute and print the 95% confidence intervals.
# Compute the Pearson correlation coefficients
r_scandens = pearson_r(bd_parent_scandens, bd_offspring_scandens)
r_fortis = pearson_r(bd_parent_fortis, bd_offspring_fortis)
# Acquire 1000 bootstrap replicates of Pearson r
bs_replicates_scandens = draw_bs_pairs(bd_parent_scandens, bd_offspring_scandens, pearson_r, size=1000)
bs_replicates_fortis = draw_bs_pairs(bd_parent_fortis, bd_offspring_fortis, pearson_r, size=1000)
# Compute 95% confidence intervals
conf_int_scandens = np.percentile(bs_replicates_scandens, [2.5, 97.5])
conf_int_fortis = np.percentile(bs_replicates_fortis, [2.5, 97.5])
# Print results
print(f'G. scandens: {r_scandens:0.2f}, {np.round(conf_int_scandens, 2)}')
print(f'G. fortis: {r_fortis:0.2f}, {np.round(conf_int_fortis, 2)}')
It is clear from the confidence intervals that the beak depth of G. fortis offspring is more strongly correlated with that of their parents than is the case for G. scandens.
Remember that the Pearson correlation coefficient is the ratio of the covariance to the geometric mean of the variances of the two data sets. This is a measure of the correlation between parents and offspring, but might not be the best estimate of heritability. If we stop and think, it makes more sense to define heritability as the ratio of the covariance between parent and offspring to the variance of the parents alone. In this exercise, you will estimate the heritability and perform a pairs bootstrap calculation to get the 95% confidence interval.
This exercise highlights a very important point. Statistical inference (and data analysis in general) is not a plug-n-chug enterprise. You need to think carefully about the questions you are seeking to answer with your data and analyze them appropriately. If you are interested in how heritable traits are, the quantity we defined as the heritability is more apt than the off-the-shelf statistic, the Pearson correlation coefficient.
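To make the distinction concrete: heritability as defined here, cov(parent, offspring) / var(parent), is exactly the slope of the least-squares regression of offspring depth on parental depth, and it equals the Pearson r rescaled by the ratio of the standard deviations. A quick check on the G. scandens arrays (a sketch; nothing here is required by the exercise):
# Heritability = cov(parent, offspring) / var(parent)
cov_mat = np.cov(bd_parent_scandens, bd_offspring_scandens)
herit = cov_mat[0, 1] / cov_mat[0, 0]
# ...which is the least-squares slope of offspring regressed on parent...
slope, _ = np.polyfit(bd_parent_scandens, bd_offspring_scandens, deg=1)
# ...and Pearson r rescaled by the ratio of standard deviations (ddof=1 to match np.cov)
r_scaled = pearson_r(bd_parent_scandens, bd_offspring_scandens) * (
    np.std(bd_offspring_scandens, ddof=1) / np.std(bd_parent_scandens, ddof=1))
print(np.allclose([slope, r_scaled], herit))  # True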
Remember, the data are stored in bd_parent_scandens, bd_offspring_scandens, bd_parent_fortis, and bd_offspring_fortis.
Instructions
- Write a function, heritability(parents, offspring), that computes heritability, defined as the covariance of the trait between parents and offspring divided by the variance of the trait in the parents. Hint: remind yourself of the np.cov() function we covered in the prequel to this course.
- Compute the heritability and 95% pairs bootstrap confidence intervals for G. scandens and G. fortis.
def heritability(parents, offspring):
    """Compute the heritability from parent and offspring samples."""
    covariance_matrix = np.cov(parents, offspring)
    return covariance_matrix[0, 1] / covariance_matrix[0, 0]
# Compute the heritability
heritability_scandens = heritability(bd_parent_scandens, bd_offspring_scandens)
heritability_fortis = heritability(bd_parent_fortis, bd_offspring_fortis)
# Acquire 1000 bootstrap replicates of heritability
replicates_scandens = draw_bs_pairs(bd_parent_scandens, bd_offspring_scandens, heritability, size=1000)
replicates_fortis = draw_bs_pairs(bd_parent_fortis, bd_offspring_fortis, heritability, size=1000)
# Compute 95% confidence intervals
conf_int_scandens = np.percentile(replicates_scandens, [2.5, 97.5])
conf_int_fortis = np.percentile(replicates_fortis, [2.5, 97.5])
# Print results
print(f'G. scandens: {heritability_scandens:0.2f}, {np.round(conf_int_scandens, 2)}')
print(f'G. fortis: {heritability_fortis:0.2f}, {np.round(conf_int_fortis, 2)}')
Here again, we see that G. fortis has stronger heritability than G. scandens. This suggests that the traits of G. fortis may be strongly incorporated into G. scandens by introgressive hybridization.
The heritability of beak depth in G. scandens seems low. It could be that this observed heritability was just achieved by chance and beak depth is actually not really heritable in the species. You will test that hypothesis here. To do this, you will do a pairs permutation test.
Instructions
- Initialize your array of 10,000 permutation replicates.
- Write a for loop to generate your replicates:
  - Permute the bd_parent_scandens array using np.random.permutation().
  - Compute the heritability between the permuted array and the bd_offspring_scandens array using the heritability() function you wrote in the last exercise. Store the result in the replicates array.
- Compute the p-value as the fraction of replicates greater than or equal to the heritability_scandens you computed in the last exercise.
# Initialize array of replicates: perm_replicates
perm_replicates = np.empty(10000)
# Draw replicates
for i in range(10000):
    # Permute parent beak depths
    bd_parent_permuted = np.random.permutation(bd_parent_scandens)
    perm_replicates[i] = heritability(bd_parent_permuted, bd_offspring_scandens)
# Compute p-value: p
p = np.sum(perm_replicates >= heritability_scandens) / len(perm_replicates)
# Print the p-value
print('p-val =', p)
You get a p-value of zero, which means that none of the 10,000 permutation-pairs replicates you drew had a heritability high enough to match the observed value. This strongly suggests that beak depth is heritable in G. scandens, just not as strongly as in G. fortis. If you like, you can plot a histogram of the heritability replicates to get a feel for how extreme a value of heritability you might expect by chance.
plt.hist(perm_replicates, bins=100)
plt.xlabel('heritability')
plt.ylabel('count')
plt.show()