# Analysis performed with Python 3.5.1
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
%pylab inline
# Resolve the data file against the current working directory and load it.
file = "titanic_data.csv"
script_dir = os.getcwd()
abs_file_path = os.path.join(script_dir, file)
titanic_df = pd.read_csv(filepath_or_buffer=abs_file_path)

# Preview the first and the last five rows of the dataset.
titanic_df.head(n=5)
titanic_df.tail(n=5)

# Column dtypes and non-null counts show which columns have missing values.
titanic_df.info()
# Encode gender numerically. Category codes follow the sorted category
# order, so with the values 'female' and 'male': 0 = female, 1 = male.
titanic_df['Sex_Numeric'] = pd.Categorical(titanic_df['Sex']).codes

# Summary statistics of Survived for every (sex, class, age, port) group.
grouping_columns = ['Sex_Numeric', 'Pclass', 'Age', 'Embarked']
grouped_survived = titanic_df.groupby(grouping_columns)
grouped_survived['Survived'].describe()
# Human-readable label columns used by the plots and models further down.

# Survival label: 0 -> 'Died', 1 -> 'Survived'
titanic_df['Survival'] = titanic_df['Survived'].map({0: 'Died', 1: 'Survived'})
titanic_df['Survival'].head()

# Passenger class label
titanic_df['Class'] = titanic_df['Pclass'].map({1: '1st Class', 2: '2nd Class', 3: '3rd Class'})
titanic_df['Class'].head()

# Gender label
titanic_df['Gender'] = titanic_df['Sex'].map({'female': 'Female', 'male': 'Male'})
titanic_df['Gender'].head()

# Treat empty / whitespace-only ports as missing. replace() returns a new
# Series, so the result has to be assigned back to take effect. The pattern
# is anchored so that only blank values are replaced, not every value that
# merely contains whitespace.
titanic_df['Embarked'] = titanic_df['Embarked'].replace(r'^\s*$', np.nan, regex=True)
titanic_df['Embarked'].head()

# Port label. Missing (or unrecognised) port codes are labelled 'unknown'
# via fillna() instead of relying on a bare `NaN` name being in scope.
# NOTE(review): 'Southhampton' is a misspelling of 'Southampton'; the label
# is kept unchanged here because it appears in downstream output.
port_labels = {'S': 'Southhampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
titanic_df['Ports'] = titanic_df['Embarked'].map(port_labels).fillna('unknown')
titanic_df['Ports'].head()
# Mosaic chart of survival broken down by passenger class and gender.
from statsmodels.graphics.mosaicplot import mosaic

plt.rc('figure', figsize=(17, 5))
mosaic_columns = ['Survival', 'Class', 'Gender']
mosaic(
    titanic_df,
    mosaic_columns,
    axes_label=False,
    title='Survival: Red=Died, Green=Survived',
)
plt.xlabel('Gender: Male & Female')
plt.ylabel('Passenger Class: 1st, 2nd & 3rd Class')
plt.show()
# Overview figure: bar charts of the categorical label columns plus an age
# histogram, laid out on a 3x2 grid (the last grid cell stays empty).
plt.rc('figure', figsize=(15, 15))
# Histogram bin width in years
bin_size = 10
# Grid shape (rows, columns) shared by all subplots
fig_size = (3, 2)

# (grid position, column, plot title) for each categorical count plot
count_plots = [
    ((0, 0), 'Survival', 'Mortality'),
    ((0, 1), 'Class', 'Passenger Class'),
    ((1, 0), 'Gender', 'Gender'),
    ((1, 1), 'Ports', 'Ports of Embarkation'),
]
for grid_pos, column_name, plot_title in count_plots:
    plt.subplot2grid(fig_size, grid_pos)
    titanic_df[column_name].value_counts().plot(kind='bar', title=plot_title)
    plt.xticks(rotation=0)
    plt.ylabel('Count')

# Age histogram. Bin edges are placed every `bin_size` years starting at 0 so
# the bars really are the decades named in the x-axis label; the default of
# ten equal-width bins between min and max age is not decade-aligned.
plt.subplot2grid(fig_size, (2, 0))
age_bin_edges = np.arange(0, titanic_df['Age'].max() + bin_size, bin_size)
titanic_df['Age'].hist(bins=age_bin_edges)
plt.title('Age')
plt.ylabel('Count')
plt.xlabel('Age Categories by Decade (years)')
# Passengers whose age is missing
missing_age_mask = titanic_df['Age'].isnull()
ageisnull = titanic_df.loc[missing_age_mask]
ageisnull.head()
print('Total passengers with no age: ', ageisnull.shape[0])
In the Dataset Exploration section, it was determined that only 714 of the 891 records have a valid age. The remaining 177 entries for Age are NaN.
# Overall mean age
titanic_df['Age'].mean()

# Mean age by gender
titanic_df.groupby('Gender')['Age'].mean()

# Mean age by passenger class and gender
titanic_df.groupby(['Class', 'Gender'])['Age'].mean()

# Mean age, then full descriptive statistics, by class, survival and gender
age_by_class_survival_gender = titanic_df.groupby(['Class', 'Survival', 'Gender'])['Age']
age_by_class_survival_gender.mean()
age_by_class_survival_gender.describe()

# Survival counts by gender, passenger class and whether Age < 20.
# A missing age compares as False, so it falls in the "not under 20" group.
survived = titanic_df['Survival']
pclass = titanic_df['Class']
sex = titanic_df['Gender']
age_youth = titanic_df['Age'].lt(20)
pd.crosstab(index=[sex, pclass, age_youth], columns=survived)
A decision is required to determine the best method of dealing with NaN values.
# Keep the original Age column intact and impute into a new column, Age_Fill.
# Each missing age is replaced by the mean age of the passengers sharing the
# same Pclass, Survived and Sex values.
#
# transform('mean') returns the group means aligned to the original row
# index, so the result can be used directly with fillna(). groupby.apply()
# may prepend the group keys to the index (depending on the pandas version),
# which breaks the index-aligned assignment back into the dataframe.
#
# NOTE(review): Survived is used for the imputation and Age_Fill is later
# regressed on Survived -- confirm that this use of the outcome is intended.
group_mean_age = (
    titanic_df
    .groupby(['Pclass', 'Survived', 'Sex'])['Age']
    .transform('mean')
)
titanic_df['Age_Fill'] = titanic_df['Age'].fillna(group_mean_age)
Create a new column called Age_Fill and fill its NaN values with the mean age of the passengers sharing the same Pclass, Survived and Sex.
# Show the first 20 rows before and after imputation for comparison
# (the original note points at entries #5, 17 & 19 as examples).
for age_column in ('Age', 'Age_Fill'):
    print(titanic_df[age_column].head(20))
# Stacked age histograms (died vs. survived), before and after imputation.
plt.rc('figure', figsize=(10, 10))

# Ages split by outcome: raw Age (may contain NaN) and imputed Age_Fill.
died_mask = titanic_df['Survived'] == 0
survived_mask = titanic_df['Survived'] == 1
df1 = titanic_df.loc[died_mask, 'Age']
df2 = titanic_df.loc[survived_mask, 'Age']
df3 = titanic_df.loc[died_mask, 'Age_Fill']
df4 = titanic_df.loc[survived_mask, 'Age_Fill']

# Series.max() skips NaN; the builtin max() gives an order-dependent result
# when NaN is present.
max_age = titanic_df['Age_Fill'].max()

# The range starts at 0 (not 1) so that passengers younger than one year are
# not silently excluded from the histograms.
age_range = (0, max_age)

# Titles, labels and legends are set after each subplot is created; calling
# them before any subplot exists only decorates a throwaway axes.

# Top panel: recorded ages only (missing ages dropped before plotting)
plt.subplot(311)
plt.hist([df1.dropna(), df2.dropna()], bins=8, range=age_range, stacked=True)
plt.legend(('Died', 'Survived'), loc='best')
plt.title('Survivors by Age Group (not filled)')
plt.ylabel('Count')

# Middle panel: ages after imputation
plt.subplot(312)
plt.hist([df3.dropna(), df4.dropna()], bins=8, range=age_range, stacked=True)
plt.legend(('Died', 'Survived'), loc='best')
plt.title('Survivors by Age Group (filled)')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
# Oldest recorded age
titanic_df['Age'].max()

# Bucket Age_Fill into decades and label each bucket with its upper edge:
# (0, 10] -> 10, (10, 20] -> 20, ..., (70, 80] -> 80
bins = list(range(0, 90, 10))
group_names = bins[1:]
titanic_df['Age_Categories'] = pd.cut(titanic_df['Age_Fill'], bins=bins, labels=group_names)
titanic_df[['Age', 'Age_Fill', 'Age_Categories']].head()

# Turn the categorical bucket labels into plain numbers
titanic_df['Age_Categories'] = pd.to_numeric(titanic_df['Age_Categories'])
An Age_Categories column has been inserted into the dataframe to simplify certain visualizations and calculations, as there are too many individual ages to easily draw conclusions or see patterns.
# Number of passengers with an age category, per survival outcome
titanic_df.groupby('Survival')[['Age_Categories']].count()

# Mosaic plots of survival by age category, split first by passenger class
# and then by gender.
from statsmodels.graphics.mosaicplot import mosaic

plt.rc('figure', figsize=(18, 6))  # figure size
mosaic_splits = (
    ('Class', 'Passenger Class: 1st, 2nd & 3rd Class'),
    ('Gender', 'Gender: Male & Female'),
)
for split_column, split_label in mosaic_splits:
    mosaic(
        titanic_df,
        ['Survival', split_column, 'Age_Categories'],
        axes_label=False,
        title='Survival: Red=Died, Green=Survived',
    )
    plt.xlabel('Age Categories by Decades (years)')
    plt.ylabel(split_label)
    plt.show()
# Survival outcomes grouped by passenger class
survival_by_class = titanic_df.groupby('Class')['Survival']

# Counts per class (rows) and outcome (columns)
pclass_ct = survival_by_class.value_counts().unstack()
pclass_ct

# The same table as proportions within each class
survival_by_class.value_counts(normalize=True).unstack()

# Stacked bar chart of the counts
plt.rc('figure', figsize=(10, 5))
pclass_ct.plot(stacked=True, kind='bar')
plt.title('Survivors by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(('Died', 'Survived'), loc='best')
plt.show()
Pclass is not a strong indicator for surviving; however, 3rd Class is a strong indicator for dying.
# Survival outcomes grouped by gender
survival_by_gender = titanic_df.groupby('Gender')['Survival']

# Counts per gender (rows) and outcome (columns)
sex_ct = survival_by_gender.value_counts().unstack()
sex_ct

# The same table as proportions within each gender
survival_by_gender.value_counts(normalize=True).unstack()

# Stacked bar chart of the counts
plt.rc('figure', figsize=(10, 5))
sex_ct.plot(stacked=True, kind='bar')
plt.title('Survivors by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(('Died', 'Survived'), loc='best')
plt.show()
Gender is a strong indicator of survivability: a significant portion of females (74%) survived, while 81% of males died.
# Survival outcomes grouped by port of embarkation
survival_by_port = titanic_df.groupby('Ports')['Survival']

# Counts per port (rows) and outcome (columns)
embarked_ct = survival_by_port.value_counts().unstack()
embarked_ct

# The same table as proportions within each port
survival_by_port.value_counts(normalize=True).unstack()

# Stacked bar chart of the counts
plt.rc('figure', figsize=(10, 5))
embarked_ct.plot(stacked=True, kind='bar')
plt.title('Survivors by Embarked')
plt.xlabel('Port of Embarkation')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(('Died', 'Survived'), loc='best')
plt.show()
# Survival counts: rows are gender / port label / passenger class,
# columns are survival outcome / age category.
age_cat = titanic_df['Age_Categories']
pclass = titanic_df['Class']
survived = titanic_df['Survival']
sex = titanic_df['Gender']
embarked = titanic_df['Ports']
pd.crosstab(
    index=[sex, embarked, pclass],
    columns=[survived, age_cat],
)
import statsmodels.formula.api as sm


def _fit_survival_ols(formula):
    """Fit an OLS model for the given formula on titanic_df and return the results."""
    return sm.ols(formula=formula, data=titanic_df).fit()


# Single-predictor models: Gender, Class, Ports and Age_Fill
result_1 = _fit_survival_ols('Survived ~ Gender')
result_1.summary()
result_2 = _fit_survival_ols('Survived ~ Class')
result_2.summary()
result_3 = _fit_survival_ols('Survived ~ Ports')
result_3.summary()
result_4 = _fit_survival_ols('Survived ~ Age_Fill')
result_4.summary()

# Combined models: all four predictors, then the same without Ports
result_5 = _fit_survival_ols('Survived ~ Gender + Class + Age_Fill + Ports')
result_5.summary()
result_6 = _fit_survival_ols('Survived ~ Gender + Class + Age_Fill')
result_6.summary()
# Summary table comparing the fitted OLS models, one row per model.
comp_index_4 = 'Gender + Class + Age_Fill + Ports'
comp_index_3 = 'Gender + Class + Age_Fill'

# Row order must match the index labels below.
fitted_models = [result_1, result_2, result_3, result_4, result_5, result_6]

# 'R-squared' holds the adjusted R-squared of each model (as before).
# 'Correlation to Survival' is the square root of the *unadjusted* R-squared:
# that is the quantity equal to the correlation between fitted and observed
# values, and unlike the adjusted value it cannot be negative, so the square
# root never yields NaN for a weak model.
statistics_df = pd.DataFrame(
    data=[[model.rsquared_adj, np.sqrt(model.rsquared)] for model in fitted_models],
    index=['Gender', 'Class', 'Ports', 'Age_Fill', comp_index_4, comp_index_3],
    columns=['R-squared', 'Correlation to Survival'],
)
statistics_df
Ordinary least squares (OLS) regression modeling has been used to determine which metric or combination of metrics provides the best prediction of survival. As can be determined by reviewing the coefficient of determination (R-squared), the individual models for Ports and Age_Fill leave a large proportion of the variance in survival unexplained. Gender and the combinations of metrics are better models. The square root of R-squared equals the Pearson correlation coefficient of predicted to actual values; Gender is the single metric with the strongest correlation. However, the combination of metrics, Gender + Class + Age_Fill + Ports, shows the strongest correlation to survival for the model used.