Candy bar product design¶

Author: Chinh X. Mai, Date: September 9, 2022

Table of contents

  • 1 Case description
  • 2 Data description
  • 3 Requirements
  • 4 Important packages and analytical pipeline
  • 5 Initial investigation
  • 6 Regression analysis
    • 6.1 Simple OLS regression
    • 6.2 Stepwise selection
  • 7 Executive summary and Conclusion

1 Case description

The purchasing department of your firm would like to expand the confectionery assortment (private label). A new candy is to be created for this purpose. However, there is still disagreement within the relevant project team about the characteristics of the new candy.

For example, while some prefer a cookie-based candy, others favor a fruit gum variation. The divisional board therefore decided to commission a market research company to determine the popularity of the sweets available on the market.

The results of the market research are now available and you have been commissioned to carry out an analysis of the data. The aim is to analyze the impact of the characteristics of confectionery on their popularity and, based on this analysis, to make a recommendation for the new confectionery.

2 Data description

The data is taken from the following case:

The Ultimate Halloween Candy Power Ranking

candy-data.csv includes attributes for each candy along with its ranking. For binary variables, 1 means yes, 0 means no.

The data contains the following fields:

  • chocolate: Does it contain chocolate?
  • fruity: Is it fruit flavored?
  • caramel: Is there caramel in the candy?
  • peanutyalmondy: Does it contain peanuts, peanut butter or almonds?
  • nougat: Does it contain nougat?
  • crispedricewafer: Does it contain crisped rice, wafers, or a cookie component?
  • hard: Is it a hard candy?
  • bar: Is it a candy bar?
  • pluribus: Is it one of many candies in a bag or box?
  • sugarpercent: The percentile of sugar it falls under within the data set.
  • pricepercent: The unit price percentile compared to the rest of the set.
  • winpercent: The overall win percentage according to 269,000 matchups.

The winpercent value is the probability of winning when the candy is matched against another brand, as shown in the Figure below

Figure 1: Example matchup (source: walthickey)

3 Requirements

The project will be delivered as a board presentation to the firm's management. The presentation will last at most 15 minutes and should not focus too much on technical details. Since the presentation is for the management, it should be short and go directly to the point, providing an executive summary first and some technical details later. Moreover, as the project will be presented in an online meeting, the participants will be close enough to the screen to read the details, but visualizations will be used to tell the story, as they are more intuitive and attract more attention.

4 Important packages and analytical pipeline

Since the data is quite simple to analyze, the pipeline is correspondingly intuitive; a simple diagram of it is given in the Figure below

Figure 2: Analytical pipeline (source: self-generated)

To implement this process, it is necessary to import essential packages first.

In [1]:
# Data and array manipulation
import pandas as pd
import numpy as np
import datetime as dt

# Regression analysis
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Plotting and Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import cufflinks as cf

# Interactive charts
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Options
init_notebook_mode(connected = True)
cf.go_offline()
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_columns', None)

5 Initial investigation

Firstly, let's load the data and investigate it briefly

In [2]:
# Load the data
df = pd.read_csv('candy-data.csv')

# Assign dummy and numerical columns
dummies = ['chocolate', 'fruity', 'caramel', 'peanutyalmondy', 'nougat', 'crispedricewafer', 'hard', 'bar', 'pluribus']
numerical = ['sugarpercent', 'pricepercent', 'winpercent']

# Show the data
df\
    .sort_values(dummies, ascending = False)\
    .set_index('competitorname')\
    .style\
    .format({'sugarpercent': '{:.2%}', 'pricepercent': '{:.2%}', 'winpercent': '{:,.2f}'})\
    .highlight_max(dummies, color = 'lightgreen')\
    .highlight_min(dummies, color = 'white')\
    .bar(color = '#ff8b59', subset = numerical, align = 'zero', height = 80)
Out[2]:
  chocolate fruity caramel peanutyalmondy nougat crispedricewafer hard bar pluribus sugarpercent pricepercent winpercent
competitorname                        
Tootsie Pop 1 1 0 0 0 0 1 0 0 60.40% 32.50% 48.98
Baby Ruth 1 0 1 1 1 0 0 1 0 60.40% 76.70% 56.91
Snickers 1 0 1 1 1 0 0 1 0 54.60% 65.10% 76.67
Snickers Crisper 1 0 1 1 0 1 0 1 0 60.40% 65.10% 59.53
Milky Way 1 0 1 0 1 0 0 1 0 60.40% 65.10% 73.10
Milky Way Midnight 1 0 1 0 1 0 0 1 0 73.20% 44.10% 60.80
100 Grand 1 0 1 0 0 1 0 1 0 73.20% 86.00% 66.97
Twix 1 0 1 0 0 1 0 1 0 54.60% 90.60% 81.64
Milky Way Simply Caramel 1 0 1 0 0 0 0 1 0 96.50% 86.00% 64.35
Milk Duds 1 0 1 0 0 0 0 0 1 30.20% 51.10% 55.06
Rolo 1 0 1 0 0 0 0 0 1 86.00% 86.00% 65.72
Almond Joy 1 0 0 1 0 0 0 1 0 46.50% 76.70% 50.35
Mr Good Bar 1 0 0 1 0 0 0 1 0 31.30% 91.80% 54.53
Nestle Butterfinger 1 0 0 1 0 0 0 1 0 60.40% 76.70% 70.74
Peanut butter M&M's 1 0 0 1 0 0 0 0 1 82.50% 65.10% 71.47
Peanut M&Ms 1 0 0 1 0 0 0 0 1 59.30% 65.10% 69.48
Reese's pieces 1 0 0 1 0 0 0 0 1 40.60% 65.10% 73.43
Reese's Miniatures 1 0 0 1 0 0 0 0 0 3.40% 27.90% 81.87
Reese's Peanut Butter cup 1 0 0 1 0 0 0 0 0 72.00% 65.10% 84.18
Reese's stuffed with pieces 1 0 0 1 0 0 0 0 0 98.80% 65.10% 72.89
3 Musketeers 1 0 0 0 1 0 0 1 0 60.40% 51.10% 67.60
Charleston Chew 1 0 0 0 1 0 0 1 0 60.40% 51.10% 38.98
Hershey's Krackel 1 0 0 0 0 1 0 1 0 43.00% 91.80% 62.28
Kit Kat 1 0 0 0 0 1 0 1 0 31.30% 51.10% 76.77
Nestle Crunch 1 0 0 0 0 1 0 1 0 31.30% 76.70% 66.47
Whoppers 1 0 0 0 0 1 0 0 1 87.20% 84.80% 49.52
Hershey's Milk Chocolate 1 0 0 0 0 0 0 1 0 43.00% 91.80% 56.49
Hershey's Special Dark 1 0 0 0 0 0 0 1 0 43.00% 91.80% 59.24
Mounds 1 0 0 0 0 0 0 1 0 31.30% 86.00% 47.83
Tootsie Roll Snack Bars 1 0 0 0 0 0 0 1 0 46.50% 32.50% 49.65
Hershey's Kisses 1 0 0 0 0 0 0 0 1 12.70% 9.30% 55.38
Junior Mints 1 0 0 0 0 0 0 0 1 19.70% 51.10% 57.22
M&M's 1 0 0 0 0 0 0 0 1 82.50% 65.10% 66.57
Sixlets 1 0 0 0 0 0 0 0 1 22.00% 8.10% 34.72
Nestle Smarties 1 0 0 0 0 0 0 0 1 26.70% 97.60% 37.89
Tootsie Roll Midgies 1 0 0 0 0 0 0 0 1 17.40% 1.10% 45.74
Tootsie Roll Juniors 1 0 0 0 0 0 0 0 0 31.30% 51.10% 43.07
Caramel Apple Pops 0 1 1 0 0 0 0 0 0 60.40% 32.50% 34.52
Gobstopper 0 1 0 0 0 0 1 0 1 90.60% 45.30% 46.78
Jawbusters 0 1 0 0 0 0 1 0 1 9.30% 51.10% 28.13
Nerds 0 1 0 0 0 0 1 0 1 84.80% 32.50% 55.35
Pop Rocks 0 1 0 0 0 0 1 0 1 60.40% 83.70% 41.27
Runts 0 1 0 0 0 0 1 0 1 87.20% 27.90% 42.85
Smarties candy 0 1 0 0 0 0 1 0 1 26.70% 11.60% 46.00
Strawberry bon bons 0 1 0 0 0 0 1 0 1 56.90% 5.80% 34.58
Dum Dums 0 1 0 0 0 0 1 0 0 73.20% 3.40% 39.46
Fun Dip 0 1 0 0 0 0 1 0 0 73.20% 32.50% 39.19
Lemonhead 0 1 0 0 0 0 1 0 0 4.60% 10.40% 39.14
Ring pop 0 1 0 0 0 0 1 0 0 73.20% 96.50% 35.29
Warheads 0 1 0 0 0 0 1 0 0 9.30% 11.60% 39.01
Chewey Lemonhead Fruit Mix 0 1 0 0 0 0 0 0 1 73.20% 51.10% 36.02
Chiclets 0 1 0 0 0 0 0 0 1 4.60% 32.50% 24.52
Dots 0 1 0 0 0 0 0 0 1 73.20% 51.10% 42.27
Fruit Chews 0 1 0 0 0 0 0 0 1 12.70% 3.40% 43.09
Haribo Gold Bears 0 1 0 0 0 0 0 0 1 46.50% 46.50% 57.12
Haribo Sour Bears 0 1 0 0 0 0 0 0 1 46.50% 46.50% 51.41
Haribo Twin Snakes 0 1 0 0 0 0 0 0 1 46.50% 46.50% 42.18
Mike & Ike 0 1 0 0 0 0 0 0 1 87.20% 32.50% 46.41
Nik L Nip 0 1 0 0 0 0 0 0 1 19.70% 97.60% 22.45
Now & Later 0 1 0 0 0 0 0 0 1 22.00% 32.50% 39.45
Red vines 0 1 0 0 0 0 0 0 1 58.10% 11.60% 37.35
Skittles original 0 1 0 0 0 0 0 0 1 94.10% 22.00% 63.09
Skittles wildberry 0 1 0 0 0 0 0 0 1 94.10% 22.00% 55.10
Sour Patch Kids 0 1 0 0 0 0 0 0 1 6.90% 11.60% 59.86
Sour Patch Tricksters 0 1 0 0 0 0 0 0 1 6.90% 11.60% 52.83
Starburst 0 1 0 0 0 0 0 0 1 15.10% 22.00% 67.04
Swedish Fish 0 1 0 0 0 0 0 0 1 60.40% 75.50% 54.86
Trolli Sour Bites 0 1 0 0 0 0 0 0 1 31.30% 25.50% 47.17
Welch's Fruit Snacks 0 1 0 0 0 0 0 0 1 31.30% 31.30% 44.38
Air Heads 0 1 0 0 0 0 0 0 0 90.60% 51.10% 52.34
Laffy Taffy 0 1 0 0 0 0 0 0 0 22.00% 11.60% 41.39
Lifesavers big ring gummies 0 1 0 0 0 0 0 0 0 26.70% 27.90% 52.91
Super Bubble 0 1 0 0 0 0 0 0 0 16.20% 11.60% 27.30
Twizzlers 0 1 0 0 0 0 0 0 0 22.00% 11.60% 45.47
Werther's Original Caramel 0 0 1 0 0 0 1 0 0 18.60% 26.70% 41.90
Sugar Babies 0 0 1 0 0 0 0 0 1 96.50% 76.70% 33.44
Sugar Daddy 0 0 1 0 0 0 0 0 0 41.80% 32.50% 32.23
Payday 0 0 0 1 1 0 0 1 0 46.50% 76.70% 46.30
Boston Baked Beans 0 0 0 1 0 0 0 0 1 31.30% 51.10% 23.42
Root Beer Barrels 0 0 0 0 0 0 1 0 1 73.20% 6.90% 29.70
Candy Corn 0 0 0 0 0 0 0 0 1 90.60% 32.50% 38.01
Haribo Happy Cola 0 0 0 0 0 0 0 0 1 46.50% 46.50% 34.16
Pixie Sticks 0 0 0 0 0 0 0 0 1 9.30% 2.30% 37.72
One dime 0 0 0 0 0 0 0 0 0 1.10% 11.60% 32.26
One quarter 0 0 0 0 0 0 0 0 0 1.10% 51.10% 46.12
In [21]:
# Generate empty co-occurrence count table
feature = df[dummies].columns
tab_co = pd.DataFrame(np.zeros((9, 9), dtype = int), index = feature, columns = feature)

# Count co-occurrences: for each pair of features, count the products having both
df_dummies = df[dummies]
for i in range(9):
    for j in range(i + 1, 9):
        co = ((df_dummies.iloc[:, i] == 1) & (df_dummies.iloc[:, j] == 1)).sum()
        tab_co.iloc[i, j] = co
        tab_co.iloc[j, i] = co

# Plot the co-occurrence counts as a heatmap
fig = px.imshow(tab_co, text_auto = '.0f', width = 800, height = 800, title = 'Co-occurrence matrix')
fig.update_traces(textfont = dict(size = 25))
fig.show()
In [22]:
# Feature frequencies 
fig = px.bar(x = df[dummies].sum(axis = 0).sort_values(), 
             y = df[dummies].sum(axis = 0).sort_values().index,
             labels = {'x': '', 'y': ''},
             title = 'Feature frequencies',
             text_auto = True,
             width = 800,
             height = 600)
fig.update_layout(yaxis_visible = True, xaxis_visible = False, xaxis_showticklabels = False)
fig.update_traces(textfont = dict(size = 20))
fig.show()

The dataset contains information about the product characteristics of different candies. The main purpose of the data is to use these characteristics to explain the popularity of the products. Generally speaking, the data is quite clean and doesn’t need any further transformation. It has 13 features and 85 observations.
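As a quick check of these figures, a small sketch inspecting the dimensions and column types:

# Structural check: 85 observations and 13 columns are expected
print(df.shape)
df.info()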

As can be seen from the table, some of the features tend to appear together in a product, so they might be highly correlated. Moreover, the numerical factors seem not to be correlated with each other. This can be observed more clearly in the scatter matrix below

In [5]:
# Scatter plot matrix
fig = px.scatter_matrix(df[numerical],
                        height = 800,
                        width = 800,
                        title = 'Scatter matrix of sugarpercent, pricepercent, and winpercent')

# Update layout to reduce clutter
fig.update_traces(diagonal_visible = False)

# Show figure
fig.show()

The actual correlations are shown in the correlation matrix below

In [6]:
# ordering variables for later use
order_column = ['winpercent', 'chocolate', 'peanutyalmondy', 
                'fruity', 'sugarpercent', 'hard', 
                'crispedricewafer', 'pricepercent', 'caramel', 
                'pluribus', 'nougat', 'bar']

# Correlation matrix
fig = px.imshow(df[order_column].corr(), text_auto = '.2f', width = 800, height = 800, 
                title = 'Correlation matrix')
fig.show()

Overall, it can be seen from the matrix that sugarpercent correlates only weakly with the other factors, while most of the remaining factors are moderately to strongly correlated with one another. This might cause problems when running regression analyses, as the standard errors of the coefficient estimates inflate when explanatory factors are correlated.
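One common way to quantify this concern is the variance inflation factor (VIF); the following is a minimal sketch, not part of the original pipeline, where values above roughly 5 to 10 are conventionally read as problematic:

# Variance inflation factors for the explanatory variables
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix with a constant, mirroring the regression fitted below
X = sm.add_constant(df[dummies + ['sugarpercent', 'pricepercent']])
vif = pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
                index = X.columns, name = 'VIF')
print(vif.drop('const').sort_values(ascending = False))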

6 Regression analysis

6.1 Simple OLS regression

Let's confirm our concerns about the issues caused by multicollinearity by first fitting a multiple linear model containing all the available explanatory variables:

In [7]:
# Prepare variables for modelling
df_y = df['winpercent']
df_x = sm.add_constant(df.drop(['competitorname', 'winpercent'], axis = 1))

# Fit the model
model_full = sm.OLS(df_y, df_x)
result_full = model_full.fit()

Firstly, let's look at the technical performance of the model

In [8]:
# performance diagnostic
result_full.summary().tables[0]
Out[8]:
OLS Regression Results
Dep. Variable: winpercent R-squared: 0.540
Model: OLS Adj. R-squared: 0.471
Method: Least Squares F-statistic: 7.797
Date: Fri, 16 Sep 2022 Prob (F-statistic): 9.50e-09
Time: 05:33:05 Log-Likelihood: -315.64
No. Observations: 85 AIC: 655.3
Df Residuals: 73 BIC: 684.6
Df Model: 11
Covariance Type: nonrobust
In [9]:
# residuals diagnostic
result_full.summary().tables[2]
Out[9]:
Omnibus: 1.025 Durbin-Watson: 1.728
Prob(Omnibus): 0.599 Jarque-Bera (JB): 1.000
Skew: -0.104 Prob(JB): 0.606
Kurtosis: 2.511 Cond. No. 10.0

The first table shows statistics for diagnosing how well the model performs technically:

  • The adjusted R-squared statistic shows that the model explains approximately 47 percent of the variation in winpercent, which is quite high for such a simple model.
  • The F-statistic of 7.797 with a very low Prob(F-statistic) indicates that at least one of the chosen explanatory variables significantly contributes to the changes in winpercent.
  • Log-Likelihood is a statistic used to compare models, so we will use it in a later section. Overall, these diagnostic statistics indicate that the employed model does a good job of explaining the dependent variable.

The second table shows statistics for diagnosing the normality of the residuals. Because OLS assumes normally distributed residuals, the statistics in this table are important for evaluating whether the model is valid.

  • Omnibus and Prob(Omnibus) are two statistics used to check the normality of the residuals. We expect an Omnibus close to zero and a Prob(Omnibus) close to one, which signifies that there is no evidence against the normality of the residuals.
  • Skew and Kurtosis describe the asymmetry and peakedness of the residual distribution; we expect a Skew close to zero and a Kurtosis close to 3 for normality. In this case, the residuals appear to be normally distributed.
  • The Durbin-Watson statistic measures autocorrelation in the residuals. If the residuals are autocorrelated, that is, one value depends on previous values, the model becomes biased, which is undesirable. The statistic always ranges between 0 and 4: a value of 2.0 indicates no autocorrelation, values below 2 point to positive autocorrelation, and values above 2 to negative autocorrelation.
  • Jarque-Bera (JB) and Prob(JB) are similar to the Omnibus test, measuring the normality of the residuals.
  • A high Cond. No. indicates that multicollinearity may be present in the dataset. If only one variable is used as a predictor, this value is low and can be ignored. We can proceed with a stepwise approach and check whether multicollinearity appears as additional variables are included. Two of these diagnostics are recomputed by hand in the sketch below.
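To make the diagnostics concrete, here is a minimal sketch recomputing the Durbin-Watson and Jarque-Bera statistics directly from the residuals of the fitted model, assuming result_full from above:

# Recompute two residual diagnostics by hand
from statsmodels.stats.stattools import durbin_watson, jarque_bera

resid = result_full.resid
print('Durbin-Watson: {:.3f}'.format(durbin_watson(resid)))
jb, jb_p, skew, kurt = jarque_bera(resid)
print('Jarque-Bera: {:.3f} (p = {:.3f}), skew = {:.3f}, kurtosis = {:.3f}'.format(jb, jb_p, skew, kurt))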

Next, let's have a look at the effects

In [10]:
# formatting functions for highlighting 
def add_color_p(x):
    if x < 0.05:
        color = '#03fca9'
    elif x < 0.1:
        color = '#f0d88b'
    else:
        color = '#e83c5c'
    return f'background: {color}'

def add_color_t(x):
    if np.abs(x) > 1.96:
        color = '#03fca9'
    elif np.abs(x) > 1.645:
        color = '#f0d88b'
    else:
        color = '#e83c5c'
    return f'background: {color}'

# effects diagnostic
eff_full = result_full.summary().tables[1]
eff_full = pd.read_html(eff_full.as_html(), header = 0, index_col = 0)[0]
eff_full.sort_values('P>|t|')\
    .style\
    .format({'P>|t|': '{:.2%}', 'coef': '{:.2f}', 'std err': '{:.2f}', 't': '{:.2f}', '[0.025': '{:.2f}', '0.975]': '{:.2f}'})\
    .bar(color = ['#e83c5c', '#03fca9'], subset = ['coef', 'std err'], align = 'zero', height = 80)\
    .applymap(add_color_t, subset = ['t'])\
    .applymap(add_color_p, subset = ['P>|t|'])
Out[10]:
  coef std err t P>|t| [0.025 0.975]
const 34.53 4.32 7.99 0.00% 25.92 43.14
chocolate 19.75 3.90 5.07 0.00% 11.98 27.52
peanutyalmondy 10.07 3.62 2.79 0.70% 2.86 17.28
fruity 9.42 3.76 2.50 1.50% 1.92 16.92
sugarpercent 9.09 4.66 1.95 5.50% -0.20 18.37
hard -6.17 3.46 -1.78 7.90% -13.05 0.72
crispedricewafer 8.92 5.27 1.69 9.50% -1.58 19.42
pricepercent -5.93 5.51 -1.07 28.60% -16.92 5.06
caramel 2.22 3.66 0.61 54.50% -5.07 9.51
pluribus -0.85 3.04 -0.28 77.90% -6.91 5.20
nougat 0.80 5.72 0.14 88.80% -10.59 12.20
bar 0.44 5.06 0.09 93.10% -9.64 10.53

The coef column shows the size of the estimated effect of each explanatory variable on the dependent variable winpercent. std err is the standard error of the estimate. The t and P>|t| values are diagnostic statistics used to evaluate whether the effects are statistically significant. Simply put, the quantities we should focus on are:

  • coef is the average size of the effect of the factor on the baseline win rate (the intercept in this case)
  • P>|t| is the probability that an effect of this size would appear by coincidence when there is actually no effect. If this probability is lower than a certain level (the critical value), the effect is said to be significant. The critical value is chosen subjectively depending on the situation; for example, if 5 percent is believed to be low enough to rule out coincidence, we accept the effect as significant whenever this probability is below that value
  • const is interpreted as the baseline win rate, i.e. the win rate when none of the considered factors is present in the product. This baseline effect can either be significant or not. For this particular model, it is the expected win rate of a candy that has none of the listed characteristics

In the table, light green cells signify coefficients that are significant at the 5 percent level, light yellow cells significance at the 10 percent level, and red cells statistical insignificance. From this information, we can see that pricepercent, caramel, pluribus, nougat, and bar do not have a significant effect on the winning percentage in matchups.

Considering the fact that the model is technically valid, this result suggests that:

  • A candy that does not contain any of the considered ingredients has an average win rate of 34.53 percent, represented by the constant (34.53)
  • Containing chocolate increases the winning chance by 19.75 percentage points, represented by the coefficient of chocolate (19.75). The same logic applies, respectively, to peanutyalmondy, fruity, and crispedricewafer
  • A hard candy has a winning rate 6.17 percentage points lower, represented by the coefficient of hard (-6.17)
  • Moving from the lowest to the highest sugar percentile increases the winning rate by 9.09 percentage points, represented by the coefficient of sugarpercent (9.09); a sketch of how these coefficients combine into a prediction is given below
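To illustrate how the coefficients combine, the following sketch scores a hypothetical product; the product profile (chocolate with peanuts at the median sugar percentile) is an assumption chosen purely for illustration:

# Hypothetical product: chocolate with peanuts at the median sugar percentile
candidate = pd.Series(0.0, index = result_full.params.index)
candidate[['const', 'chocolate', 'peanutyalmondy']] = 1
candidate['sugarpercent'] = df['sugarpercent'].median()

# The prediction is the sum of coefficient times characteristic
print('Predicted winpercent: {:.2f}'.format((result_full.params * candidate).sum()))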

This conclusion is already good enough for constructing a product with a high winning rate, but we can still do better with a stepwise selection to choose the relevant variables for our model.

6.2 Stepwise selection

Stepwise selection is the step-by-step, iterative construction of a regression model that involves selecting the independent variables to be used in a tuned model. It works by adding or removing potential explanatory variables in succession and testing for statistical significance after each iteration. The pitfall of this technique is that it might discard explanatory variables that appear statistically insignificant even though they genuinely have causal connections with the dependent variable. In this case, however, all explanatory factors are treated as equally plausible candidates for explaining the dependent variable. This section employs the forward stepwise selection approach to select significant explanatory variables for our model.

This approach is executed by the following helper function:

In [11]:
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    
    return model

This helper function starts with a model containing only the constant, then adds one more variable to the model, refits it, and evaluates the impact of the added variable using the adjusted R-squared. This process is repeated until no remaining variable improves the adjusted R-squared. The output is a model with all selected variables, each of which has a positive impact on the adjusted R-squared.

In [12]:
model = forward_selected(df.drop('competitorname', axis = 1), 'winpercent')
model.model.formula
Out[12]:
'winpercent ~ chocolate + peanutyalmondy + fruity + crispedricewafer + sugarpercent + hard + pricepercent + 1'

The stepwise selection suggests a model that contains only explanatory factors with statistically significant impacts on winpercent, except for pricepercent. Since pricepercent is the last variable added and is not statistically significant, we exclude it, as it does not add any value to the model. The final model is then

In [13]:
model_selected = smf.ols('winpercent ~ chocolate + peanutyalmondy + fruity + crispedricewafer + sugarpercent + hard + 1', data = df)
result_selected = model_selected.fit()
In [14]:
result_selected.summary().tables[0]
Out[14]:
OLS Regression Results
Dep. Variable: winpercent R-squared: 0.528
Model: OLS Adj. R-squared: 0.492
Method: Least Squares F-statistic: 14.54
Date: Fri, 16 Sep 2022 Prob (F-statistic): 4.62e-11
Time: 05:33:05 Log-Likelihood: -316.76
No. Observations: 85 AIC: 647.5
Df Residuals: 78 BIC: 664.6
Df Model: 6
Covariance Type: nonrobust
In [15]:
result_selected.summary().tables[2]
Out[15]:
Omnibus: 0.545 Durbin-Watson: 1.735
Prob(Omnibus): 0.761 Jarque-Bera (JB): 0.682
Skew: -0.093 Prob(JB): 0.711
Kurtosis: 2.602 Cond. No. 6.32
In [16]:
# effects diagnostic
eff_selected = result_selected.summary().tables[1]
eff_selected = pd.read_html(eff_selected.as_html(), header = 0, index_col = 0)[0]
eff_selected\
    .sort_values('P>|t|')\
    .style\
    .format({'P>|t|': '{:.2%}', 'coef': '{:.2f}', 'std err': '{:.2f}', 't': '{:.2f}', '[0.025': '{:.2f}', '0.975]': '{:.2f}'})\
    .bar(color = ['#e83c5c', '#03fca9'], subset = ['coef', 'std err'], align = 'zero', height = 80)\
    .applymap(add_color_t, subset = ['t'])\
    .applymap(add_color_p, subset = ['P>|t|'])
Out[16]:
  coef std err t P>|t| [0.025 0.975]
Intercept 32.94 3.52 9.37 0.00% 25.94 39.94
chocolate 19.15 3.59 5.34 0.00% 12.01 26.29
peanutyalmondy 9.48 3.45 2.75 0.70% 2.62 16.34
fruity 8.88 3.56 2.49 1.50% 1.79 15.97
sugarpercent 7.98 4.13 1.93 5.70% -0.24 16.20
crispedricewafer 8.39 4.48 1.87 6.50% -0.54 17.31
hard -5.67 3.29 -1.72 8.90% -12.22 0.88

The model suggested by the stepwise selection is better in terms of both technical performance and statistical significance: all factors are statistically significant at the 10 percent level, and the model explains 52.8 percent of the variation in the dependent variable winpercent. A side-by-side comparison of the two models is sketched below.
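A brief sketch of the comparison, assuming result_full and result_selected from the fits above (higher adjusted R-squared and lower AIC/BIC are better):

# Compare goodness-of-fit statistics of the full and the selected model
comparison = pd.DataFrame({'full model': [result_full.rsquared_adj, result_full.aic, result_full.bic],
                           'selected model': [result_selected.rsquared_adj, result_selected.aic, result_selected.bic]},
                          index = ['Adj. R-squared', 'AIC', 'BIC'])
print(comparison)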

7 Executive summary and Conclusion

This project aims to find the characteristics of a candy that has the highest winning probability when matched against another. Employing pandas, numpy, plotly, and statsmodels, the analysis involves basic steps such as data import and visualization; the main workhorse is an OLS regression supported by a stepwise selection. The data includes 13 features and 85 observations, of which nine features are binary and the others numerical. The analysis shows that:

  • A generic brand without any of the ingredients has an estimated average winning rate of approximately 33 percent
  • Among all factors, only chocolate, peanutyalmondy, fruity, crispedricewafer, sugarpercent, and hard have statistically significant impacts on winpercent
  • Among the significant factors, hard has a negative impact while the others have positive impacts

The analysis concludes that the candy with the highest winning rate is one that includes chocolate, peanutyalmondy, fruity, and crispedricewafer components together with a high amount of sugar, and that is not a hard candy.
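As a closing illustration, the selected model can score this recommended profile directly; a sketch, assuming result_selected from above, where the 0.9 sugar percentile is an illustrative stand-in for "a high amount of sugar":

# Recommended (hypothetical) product: all positive factors, high sugar, not hard
recommended = pd.DataFrame({'chocolate': [1], 'peanutyalmondy': [1], 'fruity': [1],
                            'crispedricewafer': [1], 'sugarpercent': [0.9], 'hard': [0]})
print(result_selected.predict(recommended))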