leggingsfortheworld











This is my very first post written as an ipython notebook.
Thanks to my dear friends at leggingsfortheworld.com,
I obtained their entire customer purchase database.

Here is a quick exploratory data analysis,
mainly to practice some python skills:

In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

%matplotlib inline
color = sns.color_palette()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
In [26]:
df = pd.read_csv('dataSmall.csv',encoding="ISO-8859-1",dtype={'CustomerID': str,'InvoiceID': str})
df.InvoiceDate = pd.to_datetime(df.InvoiceDate, format="%m/%d/%Y %H:%M")
df.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8678 entries, 0 to 8677
Data columns (total 8 columns):
InvoiceNo      8678 non-null object
StockCode      8678 non-null object
Description    8652 non-null object
Quantity       8678 non-null int64
InvoiceDate    8678 non-null datetime64[ns]
UnitPrice      8678 non-null float64
CustomerID     6518 non-null object
Country        8678 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 542.5+ KB
In [27]:
order_canceled = df['InvoiceNo'].apply(lambda x:int('C' in x))
n1 = order_canceled.sum()
n2 = df.shape[0]
print('Number of orders canceled: {}/{} ({:.2f}%) '.format(n1, n2, n1/n2*100))
Number of orders canceled: 185/8678 (2.13%) 

There is 209 transaction that have negative quantity or unitPrice

In [28]:
# Item sale with Quantity <=0 or unitPrice < 0
print (((df['Quantity'] <= 0) | (df['UnitPrice'] < 0)).value_counts())

#Delete the negative values 
df = df.loc[(df['Quantity'] > 0) | (df['UnitPrice'] >= 0)]
False    8469
True      209
dtype: int64
In [29]:
%%timeit
df['yearmonth'] = df['InvoiceDate'].apply(lambda x: (100*x.year) + x.month)
df['Week'] = df['InvoiceDate'].apply(lambda x: x.strftime('%W'))
df['day'] = df['InvoiceDate'].apply(lambda x: x.strftime('%d'))
df['Weekday'] = df['InvoiceDate'].apply(lambda x: x.strftime('%w'))
df['hour'] = df['InvoiceDate'].apply(lambda x: x.strftime('%H'))
335 ms ± 83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Number of Invoice

Number Invoice group by Month

In [30]:
plt.figure(figsize=(12,6))
plt.title("Frequency of order by Month", fontsize=15)
InvoiceDate = df.groupby(['InvoiceNo'])['yearmonth'].unique()
InvoiceDate.value_counts().sort_index().plot.bar()
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b6ac04710>
In [31]:
# 2010-12 Dataset end at
df.loc[df['yearmonth'] == 201012]['InvoiceDate'].max()
Out[31]:
NaT
In [32]:
# 2011-12 Dataset end at
df.loc[df['yearmonth'] == 201112]['InvoiceDate'].max()
Out[32]:
NaT

Number Invoice group by Week

In [33]:
plt.figure(figsize=(12,6))
plt.title("Frequency of order by Week", fontsize=15)
InvoiceDate = df.groupby(['InvoiceNo'])['Week'].unique()
InvoiceDate.value_counts().sort_index().plot.bar()
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b6947dc50>

Number Invoice group by day

In [34]:
plt.figure(figsize=(12,6))
plt.title("Frequency of order by Day", fontsize=15)
InvoiceDate = df.groupby(['InvoiceNo'])['day'].unique()
InvoiceDate.value_counts().sort_index().plot.bar()
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b692ce0b8>
In [35]:
grouped_df  = df.groupby(["day", "hour"])["InvoiceNo"].unique().reset_index()
grouped_df["InvoiceNo"] = grouped_df["InvoiceNo"].apply(len)

#grouped_df = df.groupby(["Weekday", "hour"])["InvoiceNo"].aggregate("count").reset_index()
grouped_df = grouped_df.pivot('day', 'hour', 'InvoiceNo')

plt.figure(figsize=(12,6))
sns.heatmap(grouped_df)
plt.title("Frequency of Day Vs Hour of day")
plt.show()

Number Invoice group by Weekday

[0] ~ Sunday

In [36]:
#[0] ~ Sunday
plt.figure(figsize=(12,6))
plt.title("Frequency of order by Weekday", fontsize=15)
InvoiceDate = df.groupby(['InvoiceNo'])['Weekday'].unique()
InvoiceDate.value_counts().sort_index().plot.bar()
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b692c4588>

Number Invoice group by Hour

In [37]:
plt.figure(figsize=(12,6))
plt.title("Frequency of order by hour of day", fontsize=15)
InvoiceDate = df.groupby(['InvoiceNo'])['hour'].unique()
(InvoiceDate.value_counts()).iloc[0:-1].sort_index().plot.bar()
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b69e4c3c8>

Frequency of Day of week Vs Hour of day

In [38]:
grouped_df  = df.groupby(["Weekday", "hour"])["InvoiceNo"].unique().reset_index()
grouped_df["InvoiceNo"] = grouped_df["InvoiceNo"].apply(len)

#grouped_df = df.groupby(["Weekday", "hour"])["InvoiceNo"].aggregate("count").reset_index()
grouped_df = grouped_df.pivot('Weekday', 'hour', 'InvoiceNo')

plt.figure(figsize=(12,6))
sns.heatmap(grouped_df)
plt.title("Frequency of Day of week Vs Hour of day")
plt.show()

Total Sales (First order vs Reorder)

In [39]:
df = pd.read_csv('dataSmall.csv',encoding="ISO-8859-1",dtype={'CustomerID': str,'InvoiceID': str})
df.InvoiceDate = pd.to_datetime(df.InvoiceDate, format="%m/%d/%Y %H:%M")

#remove the negative values and replace with nan
df.loc[df['Quantity'] <= 0, 'Quantity'] = np.nan
df.loc[df['UnitPrice'] < 0, 'UnitPrice'] = np.nan

df.dropna(inplace=True)

df['total_dollars'] = df['Quantity']*df['UnitPrice']

df['yearmonth'] = df['InvoiceDate'].apply(lambda x: (100*x.year) + x.month)
df['Week'] = df['InvoiceDate'].apply(lambda x: x.strftime('%W'))
df['day'] = df['InvoiceDate'].apply(lambda x: x.strftime('%d'))
df['Weekday'] = df['InvoiceDate'].apply(lambda x: x.strftime('%w'))
df['hour'] = df['InvoiceDate'].apply(lambda x: x.strftime('%H'))
In [40]:
#First Item Order
df_sort = df.sort_values(['CustomerID', 'StockCode', 'InvoiceDate'])
df_sort_shift1 = df_sort.shift(1)
df_sort_reorder = df_sort.copy()
df_sort_reorder['reorder'] = np.where(df_sort['StockCode'] == df_sort_shift1['StockCode'], 1,0)

Sales by Month

In [41]:
notreorder = (df_sort_reorder[df_sort_reorder['reorder'] == 0 ].groupby(['yearmonth'])['total_dollars'].sum())
reorder = (df_sort_reorder[df_sort_reorder['reorder'] == 1 ].groupby(['yearmonth'])['total_dollars'].sum())

yearmonth = pd.DataFrame([notreorder , reorder], index=['First Buy', 'Reorder']).transpose()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,9))

yearmonth.plot.bar(stacked=True, ax=axes[0])
yearmonth.plot.box(ax=axes[1])
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b6c8d6d68>

Sales by Week

In [42]:
notreorder = (df_sort_reorder[df_sort_reorder['reorder'] == 0 ].groupby(['Week'])['total_dollars'].sum())
reorder = (df_sort_reorder[df_sort_reorder['reorder'] == 1 ].groupby(['Week'])['total_dollars'].sum())

yearmonth = pd.DataFrame([notreorder , reorder], index=['First Buy', 'Reorder']).transpose()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,9))

yearmonth.plot.bar(stacked=True, ax=axes[0])
yearmonth.plot.box(ax=axes[1])
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b6cb37128>

Sales by Day

In [43]:
notreorder = (df_sort_reorder[df_sort_reorder['reorder'] == 0 ].groupby(['day'])['total_dollars'].sum())
reorder = (df_sort_reorder[df_sort_reorder['reorder'] == 1 ].groupby(['day'])['total_dollars'].sum())

yearmonth = pd.DataFrame([notreorder , reorder], index=['First Buy', 'Reorder']).transpose()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14,9))

yearmonth.plot.bar(stacked=True, ax=axes[0])
yearmonth.plot.box(ax=axes[1])
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x28b6ad34978>