-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_sim.py
231 lines (179 loc) · 12.1 KB
/
knn_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#--------- Imports ---------
import spotipy
import warnings
import pandas as pd
import streamlit as st
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from spotipy.oauth2 import SpotifyClientCredentials
warnings.filterwarnings("ignore")
#--------- Constants ---------
CLIENT_ID = "3e28cbdec0e841c08ec2d6b9b949cef1"
CLIENT_SECRET = "857ea858725348e09cbeb6f1470bcbac"
COLS = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'loudness', 'speechiness', 'tempo', 'valence']
POP1 = '37i9dQZF1DXcBWIGoYBM5M'
HH1 = '37i9dQZF1DX0XUsuxWHRQd'
COUNTRY1 = '37i9dQZF1DX1lVhptIYRda'
LATIN1 = '37i9dQZF1DX10zKzsJ2jva'
ROCK1 = '37i9dQZF1DXcF6B6QPhFDv'
EDM1 = '37i9dQZF1DX4dyzvuaRJ0n'
INDIE1 = '37i9dQZF1DXdwmD5Q7Gxah'
RB1 = '37i9dQZF1DX4SBhb3fqCJd'
JAZZ1 = '37i9dQZF1DX7YCknf2jT6s'
METAL1 = '37i9dQZF1DWXNFSTtym834'
playlist_arr = [POP1, HH1, COUNTRY1, LATIN1, ROCK1, EDM1, INDIE1, RB1, JAZZ1, METAL1]
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
#--------- Functions ---------
def retrieve_spotify_data(playlist_uri_array):
data = []
for playlist_uri in playlist_uri_array:
playlist_id = playlist_uri.split(':')[-1]
playlist = sp.playlist(playlist_id)
tracks = playlist['tracks']['items']
for track in tracks:
track_info = track['track']
track_name = track_info['name']
artist_name = track_info['artists'][0]['name']
audio_features = sp.audio_features(track_info['id'])[0]
genres = sp.artist(track_info['artists'][0]['id'])['genres']
if audio_features is not None and genres is not None:
audio_data = {
'track_name': track_name,
'artist_name': artist_name,
'genres': genres
}
audio_data.update(audio_features)
data.append(audio_data)
playlist_df = pd.DataFrame(data)
playlist_df = playlist_df.drop(['key', 'mode', 'liveness', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'], axis=1)
playlist_df.to_csv('data/music_data.csv', index=False)
return playlist_df
#playlist_df = retrieve_spotify_data(playlist_arr)
def X_y_split(playlist_df):
X = playlist_df[COLS]
y = playlist_df['genres']
return X,y
def silhouette_graph(X):
range_n_clusters = [5, 10, 15, 20, 25, 30, 35, 40]
silhouette_avg = []
for num_clusters in range_n_clusters:
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(X)
cluster_labels = kmeans.labels_
silhouette_avg.append(silhouette_score(X, cluster_labels))
fig, ax = plt.subplots()
ax.plot(range_n_clusters, silhouette_avg, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Silhouette score')
ax.set_title('Silhouette analysis For Optimal k')
return fig
def kmeans(X):
kmeans = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=5))])
kmeans.fit(X)
playlist_df['cluster'] = kmeans.predict(X)
playlist_df['genres'] = playlist_df['genres']
return playlist_df
def tsne_graph(X, playlist_df):
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = playlist_df['genres']
projection['cluster'] = playlist_df['cluster']
fig = px.scatter(
projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
return fig
def get_audio_features(song_name, artist):
query = f"track:{song_name} artist:{artist}"
results = sp.search(q=query, type='track', limit=1)
if len(results['tracks']['items']) > 0:
track_id = results['tracks']['items'][0]['id']
track_features = sp.audio_features([track_id])
if len(track_features) > 0:
track_info = sp.track(track_id)
popularity = track_info['popularity']
numerical_features = {key: value for key, value in track_features[0].items() if isinstance(value, (int, float))}
numerical_features['popularity'] = popularity
return numerical_features
return None
def knn(k, X, y, audio_feats):
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_scaled, y)
num_feat = pd.DataFrame.from_dict(audio_feats, orient='index').T
num_feat = num_feat[COLS]
new_input_scaled = scaler.transform(num_feat.values.reshape(1, -1))
distances, indices = knn.kneighbors(new_input_scaled)
nearest_songs = playlist_df.iloc[indices[0]]
# print("Nearest Songs:")
# print(nearest_songs['track_name'], nearest_songs['artist_name'])
return nearest_songs['genres'], nearest_songs['track_name'], nearest_songs['artist_name']
#--------- STREAMLIT APP ---------#
# Create a sidebar
st.sidebar.title("About Me")
st.sidebar.write("I'm Sameeha Afrulbasha! I'm an undergraduate student studying Data Science, Statistics, and Math at Purdue University. Feel free to checkout my website and other media accounts below!")
st.sidebar.markdown('Website: \n\n\n https://sameehaafr.github.io/sameehaafr/')
st.sidebar.markdown('GitHub: \n\n\n https://github.com/sameehaafr')
st.sidebar.markdown('LinkedIn: \n\n\n https://www.linkedin.com/in/sameeha-afrulbasha/')
st.sidebar.markdown('Medium: \n\n\n https://sameehaafr.medium.com/')
st.title("Spotify Song Recommendation System Using KNN")
st.markdown("In this article, we will explore how to build a Spotify song recommendation system using machine learning techniques. By leveraging the Spotipy library and Spotify's Web API, we can retrieve audio features and genre information for songs from curated playlists. Through data analysis, clustering, and dimensionality reduction, we will uncover patterns in the music data.")
st.markdown("<b>Category:</b> Unsupervised Machine Learning (K-Means and KNN)", unsafe_allow_html=True)
st.markdown("<b>Objective:</b> Cluster data using K-means and build a KNN algorithm to recommend songs based on a users song input.", unsafe_allow_html=True)
st.header("Setting up the Environment")
st.markdown("To begin, we need to import the necessary libraries and set up the [Spotify](https://developer.spotify.com/) client credentials. We import Spotipy for interacting with the Spotify API, as well as other libraries such as pandas, streamlit, plotly, and scikit-learn for data processing and visualization. We also define the client ID and client secret provided by Spotify.")
# --------- READING DATA --------- #
st.header('Retrieving Spotify Data')
st.markdown("The code retrieves data from Spotify playlists using the Spotify Web API and the Spotipy library. A function called retrieve_spotify_data is defined, which takes an array of playlist URIs as input. For each playlist, the code extracts track information, audio features, and genres. The data is stored in a pandas DataFrame and saved as a CSV file for further analysis. The bigger the dataset the better. However, for this simple application I decided to do a playlist for each genre: Pop, Hip-Hop, Country, Latin, Rock, EDM, Indie, R&B, Jazz, and Metal. I will be expanding this soon. Here is the retrieved dataset with less important features removed.")
playlist_df = pd.read_csv('data/music_data.csv')
st.dataframe(playlist_df)
# --------- DATA PROCESSING --------- #
st.header('Data Processing and Feature Engineering')
st.markdown("Next, the code performs data processing and feature engineering tasks on the retrieved Spotify data. The DataFrame is split into input features (X) and the target variable (genres) (y) for classification purposes. Less important features are removed, while important features related to the essence of music, such as acousticness, danceability, energy, instrumentalness, loudness, speechiness, tempo, and valence, are retained. We will next be performing K-Means Clustering on the data to group similar songs together based on their audio features.")
X, y = X_y_split(playlist_df)
# --------- SILHOUETTE ANALYSIS --------- #
st.header('Silhouette Analysis')
st.markdown("The code includes a silhouette analysis to evaluate the quality of the clustering results. The silhouette score is calculated for different values of K (the number of clusters) in K-means clustering. The resulting plot displays the relationship between the number of clusters and the quality of the clustering. A higher silhouette score indicates better separation between clusters, aiding in the determination of the optimal number of clusters for K-means clustering. As the graph below shows, the optimal number of clusters is around 5-6.")
fig = silhouette_graph(X)
st.pyplot(fig)
# --------- KMEANS CLUSTERING --------- #
st.header('K-Means Clustering')
st.markdown("K-Means is a clustering algorithm used in unsupervised machine learning. It partitions a dataset into K distinct clusters where each data point belongs to the cluster with the nearest mean (centroid).")
st.markdown("In this section, K-means clustering is applied to the input features (audio features) using the optimized K value (5 or 6) from the silhouette analysis to group similar tracks together. The K-means algorithm assigns cluster labels to each data point based on their audio feature values. The resulting cluster labels are added as a new column to the DataFrame, enabling further analysis and visualization of the clustered data.")
playlist_df = kmeans(X)
# --------- t-SNE VISUALIZATION --------- #
st.header('t-SNE Visualization')
st.markdown("To visualize the clustered data, the code uses t-SNE (t-Distributed Stochastic Neighbor Embedding) for dimensionality reduction. t-SNE maps high-dimensional data to a lower-dimensional space while preserving local and global structure. The resulting 2D scatter plot displays the relationships between tracks, clusters, and genres, providing insights into the distribution and patterns within the data.")
fig = tsne_graph(X, playlist_df)
st.plotly_chart(fig)
# --------- KNN RECOMMENDER --------- #
st.header('KNN Recommender')
st.markdown("Now it is time to implement the K-nearest neighbors (KNN) recommender system. Users can input a song name and artist to receive personalized recommendations. The code retrieves the audio features for the input song using the Spotify API. If the audio features are found, they are displayed in a table, providing insights into the characteristics of the input song. The KNN algorithm utilizes the input audio features and the previously clustered data to find the nearest songs in terms of audio feature similarity. The recommended genres, tracks, and artists are displayed in a table, offering personalized recommendations based on user input.")
song_name = st.text_input('Song Name', 'Crazy in Love')
artist = st.text_input('Artist', 'Beyonce')
audio_feats = None
if song_name and artist:
audio_feats = []
audio_feats = get_audio_features(song_name, artist)
if audio_feats:
st.write('Audio Features:')
audio_table = pd.DataFrame.from_dict(audio_feats, orient='index', columns=['Value'])
st.table(audio_table)
else:
st.write('No audio features found for the given song and artist.')
genres, track, artist = knn(5, X, y, audio_feats)
# --------- RECOMMENDATIONS --------- #
st.write('Recommendations:')
recommendations_table = pd.DataFrame({'Genres': genres, 'Track': track, 'Artist': artist})
st.table(recommendations_table)
# --------- CONCLUSION --------- #
st.header('Conclusion')
st.markdown("By following this code and understanding the described steps, one can build a Spotify song recommendation system that utilizes audio features, clustering, and dimensionality reduction techniques to generate personalized song recommendations.")
st.markdown("Thanks for reading! :)")