-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathretail-rfm.py
125 lines (102 loc) · 4.38 KB
/
retail-rfm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#%%: import libraries
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
#%% set console preferences
pd.set_option('display.width', 200)
sns.set(style='white')
#%% import data and clone working dataframe
df_raw = pd.read_excel('retail-data.xlsx')
df_raw.head()
df = df_raw.copy()
#%% get number of unique countries and their names
df.Country.nunique()
df.Country.unique()
#%% drop duplicates and group by country and customer ID
cc = df[['Country','CustomerID']].drop_duplicates()
cc.groupby(['Country'])['CustomerID']. \
aggregate('count').reset_index(). \
sort_values('CustomerID', ascending=False)
#%% isolate to United Kingdom only
df = df.loc[df['Country'] == 'United Kingdom']
#%% remove customers without customer ID
df = df[pd.notnull(df['CustomerID'])]
df.isnull().sum(axis=0)
#%% ensure only positive quantities and prices
df.UnitPrice.min()
df.Quantity.min()
df = df[(df['Quantity']>0)]
#%% check unique value for each column
def unique_counts(df):
for i in df.columns:
count = df[i].nunique()
print(i, ": ", count)
unique_counts(df)
#%% add column for total price
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
#%% determine first and last order date
df['InvoiceDate'].min()
df['InvoiceDate'].max()
#%% establish day after last purchase as point of calculation for recency
now = dt.datetime(2011,12,10)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
#%% create RFM table
rfmTable = df.groupby('CustomerID').agg({
'InvoiceDate': lambda x: (now - x.max()).days, #recency
'InvoiceNo': lambda x: len(x), #frequency
'TotalPrice': lambda x: x.sum()}) #monetary
rfmTable['InvoiceDate'] = rfmTable['InvoiceDate'].astype(int)
#%% convert invoice date to integer and rename columns for RFM
rfmTable.rename(columns={
'InvoiceDate': 'Recency',
'InvoiceNo': 'Frequency',
'TotalPrice': 'Monetary'}, inplace=True)
#%% shift rfmTable data to quantiles for segmentation
quantiles = rfmTable.quantile(q=[0.25,0.5,0.75])
quantiles = quantiles.to_dict()
quantiles
#%% create a segmented RFM table
rfmSegment = rfmTable.copy()
#%% create functions for calculating scores per order
def scoreRecency(x,p,d): # low recency is best and is assigned 1
if x <= d[p][0.25]:return 1
elif x <= d[p][0.50]:return 2
elif x <= d[p][0.75]:return 3
else:return 4
def scoreFrequency(x,p,d): # high frequency is best and is assigned 1
if x <= d[p][0.25]:return 4
elif x <= d[p][0.50]:return 3
elif x <= d[p][0.75]:return 2
else:return 1
def scoreMonetary(x,p,d): # high monetary is best and is assigned 1
if x <= d[p][0.25]:return 4
elif x <= d[p][0.50]:return 3
elif x <= d[p][0.75]:return 2
else:return 1
#%% create new columns for RFM and assign values based on quantile
rfmSegment['R_qt'] = rfmSegment['Recency'].apply(scoreRecency, args=('Recency',quantiles,))
rfmSegment['F_qt'] = rfmSegment['Frequency'].apply(scoreFrequency, args=('Frequency',quantiles,))
rfmSegment['M_qt'] = rfmSegment['Monetary'].apply(scoreMonetary, args=('Monetary',quantiles,))
rfmSegment.head()
#%% calculate total RFM score as string composed of individual RFM quantiles
rfmSegment['RFM'] = rfmSegment.R_qt.map(str) \
+ rfmSegment.F_qt.map(str) \
+ rfmSegment.M_qt.map(str)
#%% translate raw RFM values to log values for plotting, common log
rfmSegment = rfmSegment.assign(R_lg = lambda x: np.log10(x.Recency))
rfmSegment = rfmSegment.assign(F_lg = lambda x: np.log10(x.Frequency))
rfmSegment = rfmSegment.assign(M_lg = lambda x: np.log10(x.Monetary))
#%% isolate to high/low value customers and sort by customer monetary value
bestCustomers = rfmSegment[rfmSegment['RFM'].isin(['311', '411'])].sort_values('Monetary', ascending=False)
worstCustomers = rfmSegment[rfmSegment['RFM'] == '444'].sort_values('Monetary', ascending=False)
#%% plot frequency versus monetary of best customers
g = sns.JointGrid(x="F_lg", y="M_lg", data=bestCustomers, size=6)
g = g.plot(sns.regplot, sns.distplot). \
plot_joint(sns.kdeplot, zorder=0, n_levels=6). \
set_axis_labels("Frequency (log10)", "Monetary (log10)")
#%% use hue argument to provide a factor variable by RFM score
h = sns.lmplot(x="F_lg", y="M_lg", data=bestCustomers, size=6, hue='RFM'). \
set_axis_labels("Frequency (log10)", "Monetary (log10)")