Skip to content

Commit 434f278

Browse files
authored
Add files via upload
0 parents  commit 434f278

7 files changed

+2905
-0
lines changed

Customer_Segmentation_Part1.ipynb

Lines changed: 809 additions & 0 deletions
Large diffs are not rendered by default.

Customer_Segmentation_Part2.ipynb

Lines changed: 1858 additions & 0 deletions
Large diffs are not rendered by default.

cluster.pkl

1.59 KB
Binary file not shown.

modeling.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from kmodes.kmodes import KModes
2+
from kmodes.kprototypes import KPrototypes
3+
import pandas as pd
4+
import seaborn as sns
5+
import matplotlib.pyplot as plt
6+
import pickle
7+
8+
class Model_segmen:
    """Fit, apply and report a K-Prototypes customer segmentation.

    The frame handed to the constructor is used both to fit the model and
    to predict the segment of every row. Every K-Prototypes call passes
    column positions 0, 1 and 2 as the categorical features.
    NOTE(review): presumably the remaining columns are the standardised
    numeric features -- confirm against the step that builds the frame.
    """

    # Column positions passed to K-Prototypes as ``categorical``.
    CATEGORICAL_COLUMNS = [0, 1, 2]
    # Number of segments fitted by making_model and listed by
    # Showing_EachCustomerCluster.
    N_CLUSTERS = 5

    def __init__(self, df_model):
        """Store the modelling frame used by every later step."""
        self.df_model = df_model

    def find_optimalCluster(self):
        """Show an elbow plot of the K-Prototypes cost for k = 2..9."""
        # Collect the clustering cost for each candidate number of clusters.
        cost = {}
        for k in range(2, 10):
            kproto = KPrototypes(n_clusters=k, random_state=75)
            kproto.fit_predict(self.df_model,
                               categorical=self.CATEGORICAL_COLUMNS)
            cost[k] = kproto.cost_

        # Visualise the elbow plot.
        sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
        plt.show()

    def making_model(self):
        """Fit the final model, pickle it to ``cluster.pkl`` and keep it.

        The fitted estimator is stored on ``self.kproto`` for use_model.
        """
        kproto = KPrototypes(n_clusters=self.N_CLUSTERS, random_state=75)
        kproto = kproto.fit(self.df_model,
                            categorical=self.CATEGORICAL_COLUMNS)

        # Save the model; the context manager guarantees the handle is
        # flushed and closed even if pickling fails.
        with open('cluster.pkl', 'wb') as model_file:
            pickle.dump(kproto, model_file)

        self.kproto = kproto

    def use_model(self):
        """Predict a cluster per row and attach it to the raw customer data.

        Requires making_model to have been called first (reads
        ``self.kproto``). The raw data is downloaded from the DQLab
        dataset URL; the result is stored on ``self.df_final`` with an
        added ``cluster`` column.
        """
        df = pd.read_csv(
            "https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/customer_segments.txt",
            sep="\t",
        )

        # Determine the segment of every customer.
        clusters = self.kproto.predict(self.df_model,
                                       categorical=self.CATEGORICAL_COLUMNS)
        print('segmen pelanggan: {}\n'.format(clusters))

        # Combine the raw data with the predicted segment.
        df_final = df.copy()
        df_final['cluster'] = clusters
        print(df_final.head())

        self.df_final = df_final

    def Showing_EachCustomerCluster(self):
        """Print the rows of ``self.df_final`` belonging to each cluster."""
        for i in range(self.N_CLUSTERS):
            print('\nPelanggan cluster: {}\n'.format(i))
            print(self.df_final[self.df_final['cluster'] == i])

    def VisualizationClusteringResults_BoxPlot(self):
        """Show one box plot per numeric column, grouped by cluster."""
        # Numeric columns.
        kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

        for i in kolom_numerik:
            plt.figure(figsize=(6, 4))
            sns.boxplot(x='cluster', y=i, data=self.df_final)
            plt.title('\nBox Plot {}\n'.format(i), fontsize=12)
            plt.show()

    def VisualizationClusteringResults_CountPlot(self):
        """Show one annotated count plot per categorical column."""
        # Categorical columns.
        kolom_categorical = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

        for i in kolom_categorical:
            plt.figure(figsize=(6, 4))
            ax = sns.countplot(data=self.df_final, x='cluster', hue=i)
            plt.title('\nCount Plot {}\n'.format(i), fontsize=12)
            ax.legend(loc="upper center")

            # Write the count above every bar, since the y axis is hidden.
            for p in ax.patches:
                ax.annotate(format(p.get_height(), '.0f'),
                            (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center',
                            va='center',
                            xytext=(0, 10),
                            textcoords='offset points')

            sns.despine(right=True, top=True, left=True)
            ax.axes.yaxis.set_visible(False)
            plt.show()

    def NamingCluster(self):
        """Add a ``segmen`` name column and save the result as CSV.

        Cluster ids 0..4 are mapped to their segment names; ids outside
        the mapping become NaN. The frame is written to
        ``data/df-customer-segmentation-final.csv`` relative to the
        current working directory; the ``data`` directory is created if
        it does not exist.
        """
        from pathlib import Path

        # Map cluster ids to segment names.
        self.df_final['segmen'] = self.df_final['cluster'].map({
            0: 'Diamond Young Member',
            1: 'Diamond Senior Member',
            2: 'Silver Member',
            3: 'Gold Young Member',
            4: 'Gold Senior Member',
        })

        # info() prints its report itself and returns None, so it is not
        # wrapped in print().
        self.df_final.info()
        print(self.df_final.head())

        # Save to CSV. Building the path with pathlib keeps the separator
        # portable instead of hard-coding a Windows backslash.
        output_path = Path('data') / 'df-customer-segmentation-final.csv'
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self.df_final.to_csv(output_path, index=False)
        print('\nDataframe Sudah Tersimpan!')
99+
100+
# Download the prepared modelling frame and run the segmentation pipeline.
df_model = pd.read_csv(
    'https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/df-customer-segmentation.csv'
)
app = Model_segmen(df_model)

# Optional steps, not run by default: app.find_optimalCluster(),
# app.VisualizationClusteringResults_BoxPlot() and
# app.VisualizationClusteringResults_CountPlot().
for step in (
    app.making_model,
    app.use_model,
    app.Showing_EachCustomerCluster,
    app.NamingCluster,
):
    step()

plot_kategorikal_data.png

37 KB
Loading

plot_numerik_data.png

61.5 KB
Loading

prepare_data.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import pandas as pd
2+
import matplotlib.pyplot as plt
3+
import seaborn as sns
4+
from sklearn.preprocessing import LabelEncoder
5+
from kmodes.kmodes import KModes
6+
from kmodes.kprototypes import KPrototypes
7+
import pickle
8+
from pathlib import Path
9+
import matplotlib.pyplot as plt
10+
import seaborn as sns
11+
from sklearn.preprocessing import StandardScaler
12+
13+
class Cust_segment:
    """Load, explore and prepare the customer dataset for clustering."""

    def __init__(self, source=None):
        """Remember where the tab-separated customer data lives.

        Args:
            source: Path or URL handed to ``pd.read_csv``. When omitted,
                the module-level ``data`` variable is used, which keeps
                the original no-argument construction working.
        """
        self.data = data if source is None else source

    def read_data(self):
        """Read the dataset, print a preview and its summary, and return it."""
        # Import the dataset from the location stored on the instance.
        df = pd.read_csv(self.data, sep="\t")

        # Show the first rows.
        print(df.head())

        # Show the column summary; info() prints its report itself and
        # returns None, so it is not wrapped in print().
        df.info()

        return df

    def eksplorasi_dataNumerik(self, df):
        """Show a box plot and a distribution plot per numeric column."""
        sns.set(style='white')
        plt.clf()

        kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

        # One row per column: box plot on the left, distribution on the
        # right. squeeze=False keeps axs two-dimensional for any row count.
        fig, axs = plt.subplots(len(kolom_numerik), 2, figsize=(10, 9),
                                squeeze=False)
        for i, kol in enumerate(kolom_numerik):
            # Pass the values as ``x=`` explicitly so the plot stays
            # horizontal regardless of how the installed seaborn version
            # interprets a positional vector.
            sns.boxplot(x=df[kol], ax=axs[i][0])
            # NOTE(review): distplot is deprecated in recent seaborn
            # releases; kept because switching changes the rendered plot.
            sns.distplot(df[kol], ax=axs[i][1])
            axs[i][0].set_title(
                'mean = %.2f\n median = %.2f\n std = %.2f'
                % (df[kol].mean(), df[kol].median(), df[kol].std())
            )
        plt.tight_layout()
        plt.show()

    def eksplorasi_dataKategorikal(self, df):
        """Show an annotated count plot per categorical column."""
        sns.set(style='white')
        plt.clf()

        # Categorical columns to plot.
        kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

        # Create the canvas.
        fig, axs = plt.subplots(3, 1, figsize=(7, 10))

        # Draw one plot per categorical column, most frequent value first.
        for i, kol in enumerate(kolom_kategorikal):
            sns.countplot(x=df[kol], order=df[kol].value_counts().index,
                          ax=axs[i])
            axs[i].set_title('\nCount Plot %s\n' % (kol), fontsize=15)

            # Write the count above every bar, since the y axis is hidden.
            for p in axs[i].patches:
                axs[i].annotate(format(p.get_height(), '.0f'),
                                (p.get_x() + p.get_width() / 2.,
                                 p.get_height()),
                                ha='center',
                                va='center',
                                xytext=(0, 10),
                                textcoords='offset points')

            # Plot settings.
            sns.despine(right=True, top=True, left=True)
            axs[i].axes.yaxis.set_visible(False)
            plt.tight_layout()

        # Display the plots.
        plt.show()

    def preparation_data(self, df):
        """Standardise numeric columns, label-encode categorical ones.

        Args:
            df: Raw customer frame containing the numeric columns
                ``Umur`` and ``NilaiBelanjaSetahun`` and the categorical
                columns ``Jenis Kelamin``, ``Profesi`` and ``Tipe Residen``.

        Returns:
            A frame indexed like ``df`` with the three encoded categorical
            columns followed by the two standardised numeric columns.
        """
        # Standardise the numeric columns.
        kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

        # Statistics before standardisation.
        print('Statistik Sebelum Standardisasi\n')
        print(df[kolom_numerik].describe().round(1))

        # Standardisation.
        df_std = StandardScaler().fit_transform(df[kolom_numerik])

        # Wrap the scaled array back into a DataFrame aligned with df.
        df_std = pd.DataFrame(data=df_std, index=df.index,
                              columns=kolom_numerik)

        # Show sample rows and summary statistics.
        print('Contoh hasil standardisasi\n')
        print(df_std.head())

        print('Statistik hasil standardisasi\n')
        print(df_std.describe().round(0))

        # Convert the categorical data.
        kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

        # Work on a copy so the caller's frame is left untouched.
        df_encode = df[kolom_kategorikal].copy()

        # Label-encode every categorical column independently.
        for col in kolom_kategorikal:
            df_encode[col] = LabelEncoder().fit_transform(df_encode[col])

        # Show the encoded data.
        print(df_encode.head())

        # Join the encoded and standardised frames on their shared index.
        df_model = df_encode.merge(df_std, left_index=True, right_index=True,
                                   how='left')
        print(df_model.head())

        # Hand the prepared frame back to the caller; previously the
        # method fell off the end and returned None.
        return df_model
121+
122+
123+
# Location of the raw, tab-separated customer dataset; Cust_segment reads
# this module-level name.
data = "https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/customer_segments.txt"

# Load the data, run both exploration plots, then build the model frame.
app = Cust_segment()
data_raw = app.read_data()
for explore in (app.eksplorasi_dataNumerik, app.eksplorasi_dataKategorikal):
    explore(data_raw)
final_data = app.preparation_data(data_raw)

0 commit comments

Comments
 (0)