#!/usr/bin/env python
# coding: utf-8
# In[2]:
#import all the necessary packages.
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
warnings.filterwarnings("ignore")
# In[3]:
# we are given a json file which contains all the information about
# the products
# loading the data using pandas' read_json.
data = pd.read_json(r'tops_fashion.json')
# In[4]:
print ('Number of data points : ', data.shape[0], 'Number of features/variables:', data.shape[1])
# In[5]:
# each product/item has 19 features in the raw dataset.
data.columns # prints column-names or feature-names.
# Of these 19 features, we will be using only the following 7 features in this project.
# 1. asin (Amazon standard identification number)
# 2. brand (brand to which the product belongs)
# 3. color (color information of the apparel; it can contain many colors as a value, ex: red and black stripes)
# 4. product_type_name (type of the apparel, ex: SHIRT/TSHIRT)
# 5. medium_image_url (url of the image)
# 6. title (title of the product)
# 7. formatted_price (price of the product)
# In[6]:
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]
# In[7]:
print ('Number of data points : ', data.shape[0], 'Number of features:', data.shape[1])
data.head() # prints the top rows in the table.
# ### [5.1] Missing data for various features.
# #### Basic stats for the feature: product_type_name
# In[8]:
# We have total 72 unique type of product_type_names
print(data['product_type_name'].describe())
# 91.62% (167794/183138) of the products are shirts.
# In[9]:
# names of different product types
print(data['product_type_name'].unique())
# In[10]:
# find the 10 most frequent product_type_names.
product_type_count = Counter(list(data['product_type_name']))
product_type_count.most_common(10)
# #### Basic stats for the feature: brand
# In[11]:
# there are 10577 unique brands
print(data['brand'].describe())
# 183138 - 182987 = 151 missing values.
# In[12]:
brand_count = Counter(list(data['brand']))
brand_count.most_common(10)
# #### Basic stats for the feature: color
# In[13]:
print(data['color'].describe())
# we have 7380 unique colors
# 7.2% of products are black in color
# 64956 of 183138 products have color information. That's approx 35.5%.
# In[14]:
color_count = Counter(list(data['color']))
color_count.most_common(10)
# #### Basic stats for the feature: formatted_price
# In[15]:
print(data['formatted_price'].describe())
# Only 28,395 (15.5% of whole data) products with price information
# In[16]:
price_count = Counter(list(data['formatted_price']))
price_count.most_common(10)
# #### Basic stats for the feature: title
#
# In[17]:
print(data['title'].describe())
# All of the products have a title.
# Titles are fairly descriptive of what the product is.
# We use titles extensively in this workshop
# as they are short and informative.
# In[18]:
# consider only products which have price information.
# data['formatted_price'].isnull() => gives the rows of the dataframe
# whose price value is null (None/NaN)
data = data.loc[~data['formatted_price'].isnull()]
print('Number of data points after eliminating price=NULL :', data.shape[0])
# In[19]:
# consider only products which have color information.
# data['color'].isnull() => gives the rows of the dataframe whose color value is null (None/NaN)
data = data.loc[~data['color'].isnull()]
print('Number of data points after eliminating color=NULL :', data.shape[0])
# ### [5.2] Remove near duplicate items
# #### [5.2.1] Understand about duplicates.
# In[20]:
# read data from the pickle file saved in the previous stage
data = pd.read_pickle(r'28k_apparel_data')
# find the number of products that have duplicate titles.
print(sum(data.duplicated('title')))
# we have 2325 products which have the same title but different colors
# In[21]:
data.head()
# In[22]:
# Remove All products with very few words in title
data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]
print("After removal of products with short description:", data_sorted.shape[0])
# data_sorted.to_pickle("28k_apperal_data")
# After removal of products with short description: 27949
# In[23]:
# Sort the whole data based on title (alphabetical order of title)
data_sorted.sort_values('title',inplace=True, ascending=False)
data_sorted.head()
# #### Some examples of duplicate titles that differ only in the last few words.
# <pre>
# Titles 1:
# 16. woman's place is in the house and the senate shirts for Womens XXL White
# 17. woman's place is in the house and the senate shirts for Womens M Grey
#
# Title 2:
# 25. tokidoki The Queen of Diamonds Women's Shirt X-Large
# 26. tokidoki The Queen of Diamonds Women's Shirt Small
# 27. tokidoki The Queen of Diamonds Women's Shirt Large
#
# Title 3:
# 61. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
# 62. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
# 63. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
# 64. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
# </pre>
# In[24]:
# stores the order of indices in data_sorted
indices = []
for i, row in data_sorted.iterrows():
    indices.append(i)
# In[25]:
import itertools

stage1_dedupe_asins = []
i = 0
j = 0
num_data_points = data_sorted.shape[0]
while i < num_data_points and j < num_data_points:
    previous_i = i
    # store the list of words of the ith title in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'X-Large']
    a = data['title'].loc[indices[i]].split()
    # search for similar products sequentially
    j = i + 1
    while j < num_data_points:
        # store the list of words of the jth title in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'Small']
        b = data['title'].loc[indices[j]].split()
        # store the maximum length of the two titles
        length = max(len(a), len(b))
        # count stores the number of words that match in both titles
        count = 0
        # itertools.zip_longest(a, b) pairs up the corresponding words of both titles; it pads with None when the lengths differ
        # example: a = ['a', 'b', 'c', 'd']
        #          b = ['a', 'b', 'd']
        # itertools.zip_longest(a, b) gives [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a, b):
            if (k[0] == k[1]):
                count += 1
        # if the two titles differ in more than 2 words, we consider them to be different apparels
        # if they differ in at most 2 words, we consider them to be the same apparel and skip the duplicate
        if (length - count) > 2:  # number of words in which both titles differ
            # if both titles differ by more than 2 words, we include the ith product
            stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])
            # if the comparison is between the last two titles and they differ in more than 2 words, we include both
            if j == num_data_points - 1:
                stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[j]])
            # continue searching for apparels similar to the jth title
            i = j
            break
        else:
            j += 1
    if previous_i == i:
        break
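
# A minimal illustrative sketch (not part of the original notebook): how the
# word-by-word match count above behaves on two near-duplicate titles that
# differ only in the size at the end. Titles that differ in at most 2 words
# are treated as duplicates by the rule above.
_a = 'tokidoki the queen of diamonds womens shirt xlarge'.split()
_b = 'tokidoki the queen of diamonds womens shirt small'.split()
_count = sum(1 for x, y in itertools.zip_longest(_a, _b) if x == y)
print(max(len(_a), len(_b)) - _count)  # prints 1, so these two titles count as duplicates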
# In[26]:
data = data.loc[data['asin'].isin(stage1_dedupe_asins)]
# data.to_pickle("17k_apperal_datal")
# now only 17593 are left 17k_appereal_data
# #### We removed the dupliactes which differ only at the end.
# In[27]:
print('Number of data points : ', data.shape[0])
# #### [5.2.3] Remove duplicates : Part 2
# <pre>
#
# In the previous cell, we sorted the whole data in alphabetical order of titles. Then, we removed adjacent titles that are very similar.
#
# But there are some products whose titles are not adjacent but still very similar.
#
# Examples:
#
# Titles-1
# 86261. UltraClub Women's Classic Wrinkle-Free Long Sleeve Oxford Shirt, Pink, XX-Large
# 115042. UltraClub Ladies Classic Wrinkle-Free Long-Sleeve Oxford Light Blue XXL
#
# Titles-2
# 75004. EVALY Women's Cool University Of UTAH 3/4 Sleeve Raglan Tee
# 109225. EVALY Women's Unique University Of UTAH 3/4 Sleeve Raglan Tees
# 120832. EVALY Women's New University Of UTAH 3/4-Sleeve Raglan Tshirt
#
# </pre>
# In[28]:
data = pd.read_pickle(r'17k_apperal_data')
# In[29]:
# This code snippet takes a significant amount of time.
# O(n^2) time complexity.
# Takes about an hour to run on a decent computer.
indices = []
for i, row in data.iterrows():
    indices.append(i)

stage2_dedupe_asins = []
while len(indices) != 0:
    i = indices.pop()
    stage2_dedupe_asins.append(data['asin'].loc[i])
    # consider the current apparel's title
    a = data['title'].loc[i].split()
    # a stores the list of words of the ith title, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'X-Large']
    for j in indices[:]:  # iterate over a copy so that removing elements from indices does not skip any titles
        b = data['title'].loc[j].split()
        # b stores the list of words of the jth title
        length = max(len(a), len(b))
        # count stores the number of words that match in both titles
        count = 0
        # itertools.zip_longest(a, b) pairs up the corresponding words of both titles; it pads with None when the lengths differ
        # example: a = ['a', 'b', 'c', 'd']
        #          b = ['a', 'b', 'd']
        # itertools.zip_longest(a, b) gives [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a, b):
            if (k[0] == k[1]):
                count += 1
        # if the two titles differ in fewer than 3 words, we consider them to be the same apparel and drop the duplicate
        if (length - count) < 3:
            indices.remove(j)
# In[30]:
# from the previous set of products we keep only
# the products that were selected in the previous cell
data = data.loc[data['asin'].isin(stage2_dedupe_asins)]
# In[31]:
print('Number of data points after stage two of dedupe: ',data.shape[0])
# data.to_pickle("16k_apperal_data")
# from 17k apparels we reduced to 16k apparels
# # 6. Text pre-processing
# In[32]:
data = pd.read_pickle(r'16k_apperal_data')
# NLTK stop words download. [RUN ONLY ONCE]
# go to a Terminal (Linux/Mac) or Command Prompt (Windows)
# In the terminal, type these commands:
# $ python3
# >>> import nltk
# >>> nltk.download()
# In[33]:
# we use the list of stop words downloaded from the nltk library.
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print('list of stop words:', stop_words)

def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        for words in total_text.split():
            # remove special chars from the title like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # convert all letters to lower-case
            word = word.lower()
            # stop-word removal
            if word not in stop_words:
                string += word + " "
        data[column][index] = string
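
# A minimal illustrative sketch (not part of the original notebook): applying the
# same cleaning steps to a made-up raw title, without modifying the dataframe.
_sample_title = "Women's Floral-Print T-Shirt for the Summer (XL)"
_cleaned = ""
for _w in _sample_title.split():
    _w = "".join(e for e in _w if e.isalnum()).lower()
    if _w not in stop_words:
        _cleaned += _w + " "
print(_cleaned)  # -> "womens floralprint tshirt summer xl "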
# In[34]:
start_time = time.time()  # time.clock() was removed in Python 3.8, so we use time.time()
# we take each title and preprocess it.
for index, row in data.iterrows():
    nlp_preprocessing(row['title'], index, 'title')
# we print the time it took to preprocess all the titles
print(time.time() - start_time, "seconds")
# In[35]:
# data.to_pickle("16k_apperal_data_preprocessed")
# In[36]:
data.head()
# In[37]:
from nltk.stem.porter import *
stemmer = PorterStemmer()
print(stemmer.stem('arguing'))
print(stemmer.stem('fishing'))
# We tried using stemming on our titles and it did not work very well.
# # [8] Text based product similarity
# In[38]:
data = pd.read_pickle(r'16k_apperal_data_preprocessed')
data.head()
# In[39]:
# Utility functions which we will use throughout the rest of the workshop.

# Display an image
def display_img(url, ax, fig):
    # we take the url of the apparel and download the image
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    # we display it in the notebook
    plt.imshow(img)
# plotting code to understand the algorithm's decision.
def plot_heatmap(keys, values, labels, url, text):
    # keys: list of words of the recommended title
    # values: len(values) == len(keys), values[i] represents the occurrence count of the word keys[i]
    # labels: len(labels) == len(keys), the values of labels depend on the model we are using
    #     if model == 'bag of words': labels[i] = values[i]
    #     if model == 'tfidf weighted bag of words': labels[i] = tfidf(keys[i])
    #     if model == 'idf weighted bag of words': labels[i] = idf(keys[i])
    # url: apparel's image url
    # we divide the whole figure into two parts
    gs = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[4, 1])
    fig = plt.figure(figsize=(25, 3))
    # 1st, plot a heat map that represents the counts of words of title2 that also occur in title1
    ax = plt.subplot(gs[0])
    # a cell is shown in white if the word is in the intersection of (list of words of title1, list of words of title2), and in black if not
    ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    ax.set_xticklabels(keys)  # set the x-axis labels to the words of the title
    ax.set_title(text)  # apparel title
    # 2nd, plot the image of the apparel
    ax = plt.subplot(gs[1])
    # we don't want grid lines or x/y-axis ticks for the image
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    # we call display_img with the url parameter
    display_img(url, ax, fig)
    # display the combined figure (heat map and image together)
    plt.show()
def plot_heatmap_image(doc_id, vec1, vec2, url, text, model):
    # doc_id : index of title1
    # vec1 : input apparel's vector, it is of dict type {word: count}
    # vec2 : recommended apparel's vector, it is of dict type {word: count}
    # url : apparel's image url
    # text : title of the recommended apparel (used as the title of the image)
    # model : it can be any one of these models,
    #     1. bag_of_words
    #     2. tfidf
    #     3. idf
    # we find the common words in both titles, because only these words contribute to the distance between the two title vectors
    intersection = set(vec1.keys()) & set(vec2.keys())
    # we set the values of non-intersecting words to zero, this is just to show the difference in the heatmap
    for i in vec2:
        if i not in intersection:
            vec2[i] = 0
    # for labeling the heatmap, keys contains the list of all words in title2
    keys = list(vec2.keys())
    # if the ith word is in the intersection (list of words of title1 and list of words of title2): values[i] = count of that word in title2, else values[i] = 0
    values = [vec2[x] for x in vec2.keys()]
    # labels: len(labels) == len(keys), the values of labels depend on the model we are using
    #     if model == 'bag of words': labels[i] = values[i]
    #     if model == 'tfidf weighted bag of words': labels[i] = tfidf(keys[i])
    #     if model == 'idf weighted bag of words': labels[i] = idf(keys[i])
    if model == 'bag_of_words':
        labels = values
    elif model == 'tfidf':
        labels = []
        for x in vec2.keys():
            # tfidf_title_vectorizer.vocabulary_ contains all the words in the corpus
            # tfidf_title_features[doc_id, index_of_word_in_corpus] gives the tfidf value of the word in the given document (doc_id)
            if x in tfidf_title_vectorizer.vocabulary_:
                labels.append(tfidf_title_features[doc_id, tfidf_title_vectorizer.vocabulary_[x]])
            else:
                labels.append(0)
    elif model == 'idf':
        labels = []
        for x in vec2.keys():
            # idf_title_vectorizer.vocabulary_ contains all the words in the corpus
            # idf_title_features[doc_id, index_of_word_in_corpus] gives the idf value of the word in the given document (doc_id)
            if x in idf_title_vectorizer.vocabulary_:
                labels.append(idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[x]])
            else:
                labels.append(0)
    plot_heatmap(keys, values, labels, url, text)
# this function returns the list of words along with the frequency of each
# word in the given "text"
def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    # words stores the list of all words in the given string; 'words = text.split()' would give a similar result
    return Counter(words)  # Counter counts the occurrence of each word in the list, it returns a dict-like object {word1: count}
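
# A minimal illustrative sketch (not part of the original notebook):
# text_to_vector simply returns per-word counts for a title.
print(text_to_vector('blue cotton shirt blue'))  # Counter({'blue': 2, 'cotton': 1, 'shirt': 1})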
def get_result(doc_id, content_a, content_b, url, model):
    text1 = content_a
    text2 = content_b
    # vector1 = dict{word11: count, word12: count, etc.}
    vector1 = text_to_vector(text1)
    # vector2 = dict{word21: count, word22: count, etc.}
    vector2 = text_to_vector(text2)
    plot_heatmap_image(doc_id, vector1, vector2, url, text2, model)
# ## [8.2] Bag of Words (BoW) on product titles.
# In[40]:
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(data['title'])
title_features.get_shape()  # number of rows and columns in the feature matrix.
# title_features.shape = #data_points x #words_in_corpus
# CountVectorizer().fit_transform(corpus) returns
# a sparse matrix of dimensions #data_points x #words_in_corpus
# (a sparse matrix stores only the non-zero entries, since most words do not occur in most titles)
# title_features[doc_id, index_of_word_in_corpus] = number of times the word occurred in that doc
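
# A minimal illustrative sketch (not part of the original notebook): CountVectorizer
# on a tiny made-up corpus, to show the #documents x #vocabulary sparse layout.
_toy_corpus = ['blue cotton shirt', 'red cotton tshirt', 'blue denim shirt']
_toy_vectorizer = CountVectorizer()
_toy_features = _toy_vectorizer.fit_transform(_toy_corpus)
print(_toy_vectorizer.vocabulary_)   # word -> column index, e.g. {'blue': 0, 'cotton': 1, ...}
print(_toy_features.get_shape())     # (3 documents, 6 distinct words)
print(_toy_features.toarray())       # word counts per document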
# In[41]:
def bag_of_words_model(doc_id, num_results):
    # doc_id: apparel's id in the given corpus
    # pairwise_dist will store the distance from the given input apparel to all remaining apparels
    # pairwise_distances uses the Euclidean metric by default; the cosine similarity K(X, Y) = <X, Y> / (||X||*||Y||) is another option (metric='cosine')
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    pairwise_dist = pairwise_distances(title_features, title_features[doc_id])
    # np.argsort will return the indices of the smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # pdists will store the smallest distances
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # data frame indices of the num_results smallest distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        # we pass 1. doc_id, 2. title1, 3. title2, 4. url, 5. model
        get_result(indices[i], data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')
        print('ASIN :', data['asin'].loc[df_indices[i]])
        print('Brand:', data['brand'].loc[df_indices[i]])
        print('Title:', data['title'].loc[df_indices[i]])
        print('Euclidean distance from the query title :', pdists[i])
        print('=' * 60)
# call the bag-of-words model for a product to get similar products.
bag_of_words_model(100, 20)  # change the index if you want to.
# In the output heat map, each value represents the count
# of the labeled word, and the color represents the intersection
# with the words of the input title.
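
# A minimal illustrative sketch (not part of the original notebook): pairwise_distances
# on two toy count vectors, comparing the default Euclidean metric used above with the
# cosine metric mentioned in the comments.
_query = np.array([[1, 2, 0, 1]])
_candidates = np.array([[1, 2, 0, 1], [0, 1, 1, 0]])
print(pairwise_distances(_candidates, _query))                    # Euclidean distances; the first is 0.0
print(pairwise_distances(_candidates, _query, metric='cosine'))   # cosine distance = 1 - cosine similarity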
# ## [8.5] TF-IDF based product similarity
# In[42]:
tfidf_title_vectorizer = TfidfVectorizer(min_df=0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
# tfidf_title_features.shape = #data_points x #words_in_corpus
# TfidfVectorizer().fit_transform(corpus) returns a sparse matrix of dimensions #data_points x #words_in_corpus
# tfidf_title_features[doc_id, index_of_word_in_corpus] = tfidf value of the word in the given doc
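
# A minimal illustrative sketch (not part of the original notebook): TfidfVectorizer on the
# same tiny made-up corpus; words that occur in fewer titles (e.g. 'denim') get larger
# weights than words that occur in many titles (e.g. 'shirt').
_toy_corpus = ['blue cotton shirt', 'red cotton tshirt', 'blue denim shirt']
_toy_tfidf_vectorizer = TfidfVectorizer()
_toy_tfidf_features = _toy_tfidf_vectorizer.fit_transform(_toy_corpus)
print(_toy_tfidf_vectorizer.vocabulary_)
print(_toy_tfidf_features.toarray().round(2))  # each row is an L2-normalised tfidf vector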
# In[45]:
def tfidf_model(doc_id, num_results):
    # doc_id: apparel's id in the given corpus
    # pairwise_dist will store the distance from the given input apparel to all remaining apparels
    # pairwise_distances uses the Euclidean metric by default; the cosine similarity K(X, Y) = <X, Y> / (||X||*||Y||) is another option (metric='cosine')
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    pairwise_dist = pairwise_distances(tfidf_title_features, tfidf_title_features[doc_id])
    # np.argsort will return the indices of the num_results smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # pdists will store the num_results smallest distances
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # data frame indices of the num_results smallest distances
    df_indices = list(data.index[indices])
    for i in range(0, len(indices)):
        # we pass 1. doc_id, 2. title1, 3. title2, 4. url, 5. model
        get_result(indices[i], data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'tfidf')
        print('ASIN :', data['asin'].loc[df_indices[i]])
        print('BRAND :', data['brand'].loc[df_indices[i]])
        print('Euclidean distance from the query title :', pdists[i])
        print('=' * 125)
tfidf_model(100, 10)
# In the output heat map, each value represents the tfidf value of the labeled word, and the color represents the intersection with the words of the input title.
# In[ ]: