### Project 2: A/B Test ### """ A/B Testing (Imre Deak) Data: https://www.kaggle.com/datasets/faviovaz/marketing-ab-testing Target: To measure success of an ad campaign - did it successfully alter sales? Structure: > importing packages > loading/cleaning data, basic EDA > Univariate analysis (visualising variables, independent of each other) > Bivariate analysis (boxplots) > Statistical testing: > Chi-squared test (variable dependence test) > Picking suitable test based on assumption checks: > T-test - if data is normalised & variance is even > Mann-Whitney U test (nonparametric) - if any of the conditions above aren't met > Neither met: Mann-Whitney U test """ ### importing packages ---- import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns ### loading, cleaning data & visualisation palette ---- df = pd.read_csv("H:/my_directory/Website_Github/Portfolio/Project2\marketing_AB.csv") df.head() df.info() df.duplicated(subset = 'user id').sum() # sum of total of non-distinct data points df.drop(['Unnamed: 0', 'user id'], axis = 1, inplace = True) # removing data index & id df.columns df_cat = df[['test group', 'converted', 'most ads day', 'most ads hour']] # subset of categorical variables df_cat.nunique() # equivalent of 'levels' in R. # listing 'levels' for i in df_cat.columns: print(i.upper(), ":", df_cat[i].unique()) # colour palettes palette = sns.color_palette("RdBu") pie_colors = sns.color_palette("RdBu") ### Univariate analysis ---- # test group variable = 'test group' plt.figure(figsize = (6,4)) # Count plot plt.subplot(1,2,1) sns.countplot(x=variable, data=df_cat, palette = palette) plt.title(f'Count plot - {variable}') # Pie chart plt.subplot(1,2,2) counts = df_cat[variable].value_counts() plt.pie(counts, labels=counts.index, autopct='%0.2f%%', colors=pie_colors) plt.title(f'Pie chart - {variable}') # Show plot plt.show() # Conversion rates variable = 'converted' plt.figure(figsize = (6,4)) # Count plot plt.subplot(1,2,1) sns.countplot(x=variable, data=df_cat, palette = palette) plt.title(f'Count plot - {variable}') # Pie chart plt.subplot(1,2,2) counts = df_cat[variable].value_counts() plt.pie(counts, labels=counts.index, autopct='%0.2f%%', colors=pie_colors) plt.title(f'Pie chart - {variable}') # Show plot plt.show() # Most ads received a day variable = 'most ads day' plt.figure(figsize = (6,4)) # Count plot plt.subplot(1,2,1) sns.countplot(x=variable, data=df_cat, palette = palette, order = df_cat['most ads day'].value_counts().index) # value_counts() does descending order arrangement. plt.title(f'Count plot - {variable}') plt.xticks(rotation=90) # Pie chart plt.subplot(1,2,2) counts = df_cat[variable].value_counts() plt.pie(counts, labels=counts.index, autopct='%0.2f%%', colors=pie_colors) plt.title(f'Pie chart - {variable}') # Show plot plt.tight_layout() plt.show() # most ads hour variable = 'most ads hour' plt.figure(figsize = (8,4)) # Count plot plt.subplot(1,2,1) sns.countplot(x=variable, data=df_cat, order = df_cat['most ads hour'].value_counts().index) plt.title(f'Count plot - {variable}') plt.xticks(rotation=90) # Pie chart plt.subplot(1,2,2) counts = df_cat[variable].value_counts() plt.pie(counts, labels=counts.index, autopct='%0.2f%%') plt.title(f'Pie chart - {variable}') # Show plot plt.tight_layout() # makes spacing nicer plt.show() # total ads (non-categorical numeric) variable = 'total ads' plt.figure(figsize=(6,4)) # Histogram plt.subplot(1,2,1) sns.histplot(x=variable, data=df, palette=palette) plt.title(f'Histogram - {variable}') # Pie chart plt.subplot(1,2,2) sns.boxplot(y = variable, data = df, color=pie_colors) plt.title(f'Boxplot - {variable}') # Layout/show plot plt.tight_layout() plt.show() # <-- large data N, hence many outliers. Poor visualisation w/out filter df['total ads'].describe() # descriptive stats # Repeating previous plot but w/ a filter (trial and error estimation of upper boundary) variable = 'total ads' plt.figure(figsize=(6,4)) # Histogram plt.subplot(1,2,1) sns.histplot(x=variable, data=df[df['total ads'] < 50] ) plt.title(f'Histogram - {variable}') # Pie chart plt.subplot(1,2,2) sns.boxplot(y = variable, data = df[df['total ads'] < 50] ) plt.title(f'Boxplot - {variable}') # Layout/show plot plt.tight_layout() plt.show() # <-- boxplot spread is easily visible, covers all quartiles. ### Bivariate analysis ---- # Crosstab = creates a 2x2 table for two variables ct_conversion_test_group = pd.crosstab(df['test group'], df['converted'], normalize = 'index') ct_conversion_test_group ct_conversion_test_group.plot.bar(stacked = True) # visualising crosstab # repeating w/ other variables against 'converted' ct_conversion_day = pd.crosstab(df['most ads day'], df['converted'], normalize = 'index') print(ct_conversion_day.sort_values(by = True, ascending = False)) ct_conversion_day.plot.bar(stacked = True) ct_conversion_day = pd.crosstab(df['most ads hour'], df['converted'], normalize = 'index') print(ct_conversion_day.sort_values(by = True, ascending = False)) ct_conversion_day.plot.bar(stacked = True) # Raw & filtered boxplot comparison, converted = x-axis category sns.boxplot(x = 'converted', y = 'total ads', palette=palette, data = df) sns.boxplot(x = 'converted', y = 'total ads', palette=palette, data = df[df['total ads'] < 50]) ''' Plot above shows that the median number of times people who purchased the product saw the ad was 25, as opposed to ~10 for those who have not purchased the product (ie. converted) -> repeated targeting seems to yield a higher sales rate ''' ### Statistical testing ---- # Chi-squared test (Automated @ 5% significance level) from scipy.stats import chi2_contingency alpha = 0.05 for variable in df_cat.columns: if variable != 'converted': # Create crosstab contingency_table = pd.crosstab(df_cat[variable], df_cat['converted']) # Chi-squared test chi2, p, _, _ = chi2_contingency(contingency_table) # Display results print(f"\nChi-squared test for {variable} vs. converted:") print(f"Chi-squared test value: {chi2}") print(f"p-value: {p}") # Check for significance if p < alpha: print(f"The difference in conversion rates across {variable} is statistically significant.") else: print(f"There is no significant difference in conversion rates across {variable}.") # Checking assumptions for final test (t-test vs nonparametric alt.) from scipy.stats import shapiro, levene, ttest_ind, mannwhitneyu # Normality assumption shapiro_stat_true, shapiro_p_value_true = shapiro(df[df['converted'] == True]['total ads']) shapiro_stat_false, shapiro_p_value_false = shapiro(df[df['converted'] == False]['total ads']) print(f"Shapiro-Wilk test for normality (True group): p-value - {shapiro_p_value_true}") print(f"Shapiro-Wilk test for normality (False group): p-value - {shapiro_p_value_false}") # Equality of variances assumption levene_stat, levene_p_value = levene(df[df['converted']]['total ads'], df[~df['converted']]['total ads']) print(f"Levene's test for equality of variances: p-value = {levene_p_value}") # Automation of correct test based on assumption test results alpha = 0.05 if shapiro_p_value_true > alpha and shapiro_p_value_false > alpha and levene_p_value > alpha: # if assumptions are met - deploys t-test (based on mean) t_stat, t_p_value = ttest_ind(df[df['converted']]['total ads'], df[~df['converted']]['total ads']) print(f"Independent two-sample t-test: p-value = {t_p_value}") else: # otherwise - use Mann-Whitney U test (based on median) u_stat, u_p_value = mannwhitneyu(df[df['converted']]['total ads'], df[~df['converted']]['total ads']) print(f"Mann-Whitney U test: p-value = {u_p_value}") ''' Non-parametric Mann-Whitney U test, with p-value of 0.0 -> There is sufficient evidence to reject null hypothesis -> Median amount of total ads seen makes a statistically significant difference on aggregate purchasing decision '''