Skillz Analysis

In [1]:
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint as pp
from pathlib import Path
import matplotlib.dates as mdates

# Register pandas' date/period unit converters with matplotlib so that
# pandas time-like values can be placed on matplotlib axes.
register_matplotlib_converters()
# Apply seaborn's default theme to every figure created from here on.
sns.set()
# %matplotlib inline

pandas options

In [2]:
# Raise pandas' display limits so wide and long frames render without truncation.
# Fully-qualified option names are used on purpose: pd.set_option treats its
# first argument as a pattern, and short forms such as 'max_columns' /
# 'max_rows' match more than one option on recent pandas releases, which
# raises OptionError instead of setting anything.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)
pd.set_option('display.expand_frame_repr', True)

Data

In [3]:
# The workbook is looked up in the current working directory, i.e. wherever
# the notebook kernel was started.
data_dir_path = Path.cwd()
skillz_data = data_dir_path.joinpath('Skillz Analytics H2H Homework Data_v0.4.xlsx')

Pandas DataFrame

In [4]:
# Load the '1. Aggregate' worksheet of the workbook into a DataFrame.
skillz_df = pd.read_excel(skillz_data, sheet_name='1. Aggregate')

Data Exploration

In [5]:
# Preview the first five rows of the raw sheet.
skillz_df.head(n=5)
Out[5]:
experiment_group date daily_installs new_depositors z_daily_active_users cash_daily_active_users z_games cash_games entry_fees deposits
0 Control 2017-10-19 191 9 37 9 566 93 114.00 40
1 Control 2017-10-20 335 15 140 28 2400 559 555.26 155
2 Control 2017-10-21 333 17 205 44 3412 813 927.63 130
3 Control 2017-10-22 365 20 264 59 4798 1315 1232.51 185
4 Control 2017-10-23 357 22 324 77 5674 1350 1324.40 170
In [6]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
skillz_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 10 columns):
experiment_group           134 non-null object
date                       134 non-null datetime64[ns]
daily_installs             134 non-null int64
new_depositors             134 non-null int64
z_daily_active_users       134 non-null int64
cash_daily_active_users    134 non-null int64
z_games                    134 non-null int64
cash_games                 134 non-null int64
entry_fees                 134 non-null float64
deposits                   134 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(7), object(1)
memory usage: 10.5+ KB
In [7]:
# Move the 'date' column into the index so the frame can be sliced, rolled
# and resampled by date. The assignment form is used instead of inplace=True
# so the statement reads as an explicit derivation of a new frame.
skillz_df = skillz_df.set_index('date')
In [18]:
# Keep skillz_df on its DatetimeIndex. This cell was originally executed
# after the rolling / resample / line-plot cells further down; run top to
# bottom, converting the index to periods here would change what
# resample('W') returns and hand Period values to ax.plot, which matplotlib
# is expected to reject (no `freq` set on the axis).
# The day-resolution periods are kept in a separate variable, leaving
# skillz_df itself unmodified.
skillz_day_periods = skillz_df.index.to_period('D')

7-day Centered Rolling Mean

In [8]:
# Every column except the first one. With 'date' moved into the index, the
# first remaining column is 'experiment_group' (see the frame preview above),
# so this selects the eight metric columns.
# NOTE(review): positional slice - depends on the sheet's column order.
numeric_columns = skillz_df.columns[1:]
In [9]:
# Centered 7-row rolling mean of every metric, computed separately for the
# Control and the Test rows. The window is a row count, so the first and
# last three rows of each group come out as NaN.
is_control = skillz_df.experiment_group == 'Control'
is_test = skillz_df.experiment_group == 'Test'

skillz_7d_control = (
    skillz_df.loc[is_control, numeric_columns]
    .rolling(window=7, center=True)
    .mean()
)
skillz_7d_test = (
    skillz_df.loc[is_test, numeric_columns]
    .rolling(window=7, center=True)
    .mean()
)
In [24]:
# First ten rows of the Control rolling means (the leading rows are NaN
# because the centered window is incomplete there).
skillz_7d_control.head(n=10)
Out[24]:
daily_installs new_depositors z_daily_active_users cash_daily_active_users z_games cash_games entry_fees deposits
date
2017-10-19 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-20 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-21 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-22 328.714286 18.000000 248.428571 59.142857 4271.857143 1130.428571 1268.330000 220.000000
2017-10-23 349.857143 20.857143 307.285714 75.428571 5263.142857 1421.571429 1590.985714 262.142857
2017-10-24 358.714286 22.142857 358.142857 91.857143 6129.000000 1653.000000 1847.217143 300.714286
2017-10-25 354.571429 21.428571 400.000000 105.714286 6929.714286 1873.142857 2097.965714 343.571429
2017-10-26 356.142857 21.285714 440.000000 117.857143 7822.571429 2044.428571 2337.335714 367.142857
2017-10-27 354.428571 20.571429 482.142857 129.000000 8796.428571 2245.142857 2610.718571 415.000000
2017-10-28 340.142857 19.142857 522.714286 138.857143 9685.857143 2359.142857 2744.488571 429.285714
In [25]:
# First ten rows of the Test rolling means (the leading rows are NaN
# because the centered window is incomplete there).
skillz_7d_test.head(n=10)
Out[25]:
daily_installs new_depositors z_daily_active_users cash_daily_active_users z_games cash_games entry_fees deposits
date
2017-10-19 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-20 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-21 NaN NaN NaN NaN NaN NaN NaN NaN
2017-10-22 319.285714 14.142857 247.571429 56.857143 4571.571429 1439.571429 1614.420000 180.000000
2017-10-23 345.285714 15.857143 309.428571 74.571429 5717.000000 1805.000000 2244.482857 213.571429
2017-10-24 343.857143 17.571429 363.285714 93.142857 6487.571429 2217.571429 3121.850000 264.285714
2017-10-25 344.142857 18.142857 407.142857 109.428571 7179.714286 2511.714286 3658.224286 305.000000
2017-10-26 345.571429 19.000000 444.142857 124.857143 7766.142857 2827.142857 4255.007143 360.714286
2017-10-27 344.142857 18.571429 481.571429 138.857143 8456.142857 3146.714286 4893.014286 392.857143
2017-10-28 333.285714 17.714286 515.857143 151.571429 9012.714286 3421.142857 5069.988571 379.285714

Weekly Mean

In [12]:
# Mean of every metric per 'W' resampling bin, computed separately for the
# Control and the Test rows.
control_rows = skillz_df.experiment_group == 'Control'
test_rows = skillz_df.experiment_group == 'Test'

skillz_weekly_control = skillz_df.loc[control_rows, numeric_columns].resample('W').mean()
skillz_weekly_test = skillz_df.loc[test_rows, numeric_columns].resample('W').mean()
In [20]:
# Display the weekly means for the Control group.
skillz_weekly_control
Out[20]:
daily_installs new_depositors z_daily_active_users cash_daily_active_users z_games cash_games entry_fees deposits
date
2017-10-22 306.000000 15.250000 161.500000 35.000000 2794.000000 695.000000 707.350000 127.500000
2017-10-29 356.142857 21.285714 440.000000 117.857143 7822.571429 2044.428571 2337.335714 367.142857
2017-11-05 277.571429 21.571429 683.857143 182.714286 13168.428571 3244.857143 3773.835714 587.857143
2017-11-12 264.714286 23.428571 890.000000 231.714286 17258.142857 3934.428571 4294.037143 613.571429
2017-11-19 342.285714 23.857143 1066.428571 269.714286 20781.857143 4671.142857 5626.441429 905.142857
2017-11-26 405.285714 21.000000 1366.428571 324.142857 26173.428571 5165.428571 6196.648571 948.571429
2017-12-03 398.714286 26.142857 1587.571429 366.428571 28694.428571 5928.857143 7383.361429 1089.571429
2017-12-10 417.142857 28.714286 1766.857143 413.285714 31762.000000 6341.428571 8609.475714 1049.714286
2017-12-17 415.142857 31.285714 1877.571429 463.857143 33402.571429 7077.571429 10453.257143 1526.571429
2017-12-24 376.714286 27.285714 2004.285714 485.000000 35178.714286 7955.285714 13478.264286 1640.428571
In [21]:
# Display the weekly means for the Test group.
skillz_weekly_test
Out[21]:
daily_installs new_depositors z_daily_active_users cash_daily_active_users z_games cash_games entry_fees deposits
date
2017-10-22 296.500000 10.750000 154.250000 29.000000 3003.000000 783.750000 648.415000 85.000000
2017-10-29 345.571429 19.000000 444.142857 124.857143 7766.142857 2827.142857 4255.007143 360.714286
2017-11-05 274.285714 21.428571 643.857143 199.142857 11149.857143 4355.285714 7442.741429 628.000000
2017-11-12 263.857143 20.857143 833.142857 263.000000 14991.714286 4928.285714 4886.754286 577.142857
2017-11-19 355.857143 27.000000 1032.285714 323.000000 18599.714286 6293.000000 5396.095714 745.857143
2017-11-26 405.428571 24.714286 1252.571429 386.285714 22815.285714 7221.000000 4879.284286 647.857143
2017-12-03 399.142857 30.428571 1498.714286 460.571429 26149.142857 8458.571429 8528.761429 1047.857143
2017-12-10 407.142857 31.857143 1699.000000 561.714286 29113.142857 10432.428571 8982.507143 1140.142857
2017-12-17 422.714286 25.571429 1826.571429 593.000000 30755.571429 10802.428571 13109.272857 1466.571429
2017-12-24 373.714286 32.000000 1893.142857 630.857143 31943.428571 11348.142857 14717.127143 1760.000000

Plots

Daily Counts

In [15]:
# Columns to plot: everything after the first column of skillz_df.
# NOTE(review): same expression as `numeric_columns` defined earlier -
# presumably the two are meant to stay identical; confirm.
df_columns = skillz_df.columns[1:]
In [19]:
# One bar chart per metric column, stacked vertically on a shared x axis,
# with Control and Test drawn side by side for every day.
fig, axes = plt.subplots(len(df_columns), 1, figsize=(15, 22), sharex=True)

# Plain 'YYYY-MM-DD' strings give short categorical tick labels and work
# whether the index currently holds timestamps or day periods.
day_labels = skillz_df.index.strftime('%Y-%m-%d')

for name, ax in zip(df_columns, axes):
    sns.barplot(data=skillz_df, x=day_labels, y=name, hue='experiment_group', ax=ax)

# With sharex=True only the bottom axes shows tick labels, so rotate them
# there once, explicitly, rather than relying on pyplot's "current axes"
# on every loop iteration.
axes[-1].tick_params(axis='x', labelrotation=90)

Weekly Mean Resample

In [16]:
# Date tick locator built with its default arguments; used as the major
# x-axis locator of the time-series plots.
# NOTE(review): matplotlib documents the default as byweekday=1 (Tuesday),
# interval=1 - confirm that is the intended tick day.
weeks = mdates.WeekdayLocator()
In [17]:
# One time-series panel per metric column. Each panel overlays, for Control
# and then for Test: the raw daily values, the weekly mean resample and the
# centered 7-day rolling mean.
# NOTE(review): assumes skillz_df and the derived weekly/rolling frames carry
# a DatetimeIndex - confirm the index has not been converted to periods.
fig, axes = plt.subplots(len(df_columns), 1, figsize=(15, 60), sharex=True)

# (group label, weekly-mean frame, rolling-mean frame) in drawing order, so
# line colours and legend entries keep the Control-then-Test sequence.
group_frames = (
    ('Control', skillz_weekly_control, skillz_7d_control),
    ('Test', skillz_weekly_test, skillz_7d_test),
)

for name, ax in zip(df_columns, axes):
    for group, weekly, rolling in group_frames:
        daily = skillz_df.loc[skillz_df.experiment_group == group, name]
        ax.plot(daily, marker='.', linestyle='-', linewidth=0.5,
                label='{}: Daily'.format(group))
        ax.plot(weekly[name], marker='o', markersize=8, linestyle='-',
                label='{}: Weekly Mean Resample'.format(group))
        ax.plot(rolling[name], marker='.', markersize=5, linestyle='-',
                label='{}: 7-Day Rolling Mean'.format(group))
    ax.set_ylabel(name)
    # A Locator instance is bound to a single Axis, so each panel gets its
    # own instead of sharing one object across all of them.
    ax.xaxis.set_major_locator(mdates.WeekdayLocator())
    ax.legend()

# With sharex=True only the bottom axes shows tick labels; rotate them once.
axes[-1].tick_params(axis='x', labelrotation=90)

entry_fees vs. cash_daily_active_users

In [34]:
# entry_fees (y) against cash_daily_active_users (x), coloured by group.
# x / y are passed by keyword together with data=: recent seaborn releases
# accept them as keyword-only and raise TypeError for positional vectors.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=skillz_df, x='cash_daily_active_users', y='entry_fees',
                hue='experiment_group', ax=ax)
ax.set_title('entry_fees vs. cash_daily_active_users')
plt.show()

cash_games vs. entry_fees

In [35]:
# cash_games (y) against entry_fees (x), coloured by group.
# x / y are passed by keyword together with data=: recent seaborn releases
# accept them as keyword-only and raise TypeError for positional vectors.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=skillz_df, x='entry_fees', y='cash_games',
                hue='experiment_group', ax=ax)
ax.set_title('cash_games vs. entry_fees')
plt.show()

cash_games vs. deposits

In [36]:
# cash_games (y) against deposits (x), coloured by group.
# x / y are passed by keyword together with data=: recent seaborn releases
# accept them as keyword-only and raise TypeError for positional vectors.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=skillz_df, x='deposits', y='cash_games',
                hue='experiment_group', ax=ax)
ax.set_title('cash_games vs. deposits')
plt.show()

deposits vs. new_depositors

In [37]:
# deposits (y) against new_depositors (x), coloured by group.
# x / y are passed by keyword together with data=: recent seaborn releases
# accept them as keyword-only and raise TypeError for positional vectors.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=skillz_df, x='new_depositors', y='deposits',
                hue='experiment_group', ax=ax)
ax.set_title('deposits vs. new_depositors')
plt.show()

z_games vs. z_daily_active_users

In [38]:
# z_games (y) against z_daily_active_users (x), coloured by group.
# x / y are passed by keyword together with data=: recent seaborn releases
# accept them as keyword-only and raise TypeError for positional vectors.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(data=skillz_df, x='z_daily_active_users', y='z_games',
                hue='experiment_group', ax=ax)
ax.set_title('z_games vs. z_daily_active_users')
plt.show()

Questions

The provided data is inadequate for fraud analytics, and the associated questions aren't relevant to it.

  1. What additional data would you have liked to see in this analysis?
    • Individual account data would be required for fraud detection
    • Tagged data to implement supervised learning models
In [ ]:
 
In [ ]: