Skip to content

Commit 84fe98c

Browse files
committed
feat: add generated embeddings
1 parent 16b686f commit 84fe98c

File tree

4 files changed

+212
-8
lines changed

4 files changed

+212
-8
lines changed

playground/12-embeddings.ipynb

+204-3
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
},
6868
{
6969
"cell_type": "code",
70-
"execution_count": 5,
70+
"execution_count": 4,
7171
"metadata": {},
7272
"outputs": [],
7373
"source": [
@@ -77,21 +77,222 @@
7777
},
7878
{
7979
"cell_type": "code",
80-
"execution_count": 6,
80+
"execution_count": 5,
8181
"metadata": {},
8282
"outputs": [],
8383
"source": [
8484
"dataset_path = \"./datasets/movie_plots.csv\"\n",
8585
"df = pd.read_csv(dataset_path)"
8686
]
8787
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 6,
91+
"metadata": {},
92+
"outputs": [],
93+
"source": [
94+
"movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(50)"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 7,
100+
"metadata": {},
101+
"outputs": [],
102+
"source": [
103+
"movie_plots = movies[\"Plot\"].values"
104+
]
105+
},
106+
{
107+
"attachments": {},
108+
"cell_type": "markdown",
109+
"metadata": {},
110+
"source": [
111+
"## Generating the embeddings"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": 8,
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
121+
"import pickle\n",
122+
"import tiktoken"
123+
]
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": 9,
128+
"metadata": {},
129+
"outputs": [],
130+
"source": [
131+
"@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
132+
"def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
133+
"\n",
134+
" # replace newlines, which can negatively affect performance.\n",
135+
" text = text.replace(\"\\n\", \" \")\n",
136+
"\n",
137+
" return openai.Embedding.create(input=text, model=model)[\"data\"][0][\"embedding\"]"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": 10,
143+
"metadata": {},
144+
"outputs": [],
145+
"source": [
146+
"enc = tiktoken.encoding_for_model(\"text-embedding-ada-002\")"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 11,
152+
"metadata": {},
153+
"outputs": [],
154+
"source": [
155+
"total_tokens = sum([len(enc.encode(plot)) for plot in movie_plots])"
156+
]
157+
},
88158
{
89159
"cell_type": "code",
90160
"execution_count": 12,
91161
"metadata": {},
162+
"outputs": [
163+
{
164+
"data": {
165+
"text/plain": [
166+
"16751"
167+
]
168+
},
169+
"execution_count": 12,
170+
"metadata": {},
171+
"output_type": "execute_result"
172+
}
173+
],
174+
"source": [
175+
"total_tokens"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": 13,
181+
"metadata": {},
182+
"outputs": [
183+
{
184+
"name": "stdout",
185+
"output_type": "stream",
186+
"text": [
187+
"Estimated cost $0.01\n"
188+
]
189+
}
190+
],
191+
"source": [
192+
"cost = total_tokens * (.0004 / 1000)\n",
193+
"print(f\"Estimated cost ${cost:.2f}\")"
194+
]
195+
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": 16,
199+
"metadata": {},
92200
"outputs": [],
93201
"source": [
94-
"movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(500)"
202+
"# establish a cache of embeddings to avoid recomputing\n",
203+
"# cache is a dict mapping (text, model) tuples to embeddings, saved as a pickle file\n",
204+
"\n",
205+
"# set path to embedding cache\n",
206+
"embedding_cache_path = \"./embeddings/movie_embeddings_cache.pkl\"\n",
207+
"\n",
208+
"# load the cache if it exists (otherwise start with an empty one), and save a copy to disk\n",
209+
"try:\n",
210+
" embedding_cache = pd.read_pickle(embedding_cache_path)\n",
211+
"except FileNotFoundError:\n",
212+
" embedding_cache = {}\n",
213+
"with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
214+
" pickle.dump(embedding_cache, embedding_cache_file)\n",
215+
"\n",
216+
"# define a function to retrieve embeddings from the cache if present, and otherwise request via the API\n",
217+
"def embedding_from_string(\n",
218+
" string,\n",
219+
" model=\"text-embedding-ada-002\",\n",
220+
" embedding_cache=embedding_cache\n",
221+
"):\n",
222+
" \"\"\"Return embedding of given string, using a cache to avoid recomputing.\"\"\"\n",
223+
" if (string, model) not in embedding_cache.keys():\n",
224+
" embedding_cache[(string, model)] = get_embedding(string, model)\n",
225+
" print(f\"GOT EMBEDDING FROM OPENAI FOR {string[:20]}\")\n",
226+
" with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
227+
" pickle.dump(embedding_cache, embedding_cache_file)\n",
228+
" return embedding_cache[(string, model)]"
229+
]
230+
},
231+
{
232+
"cell_type": "code",
233+
"execution_count": 15,
234+
"metadata": {},
235+
"outputs": [
236+
{
237+
"name": "stdout",
238+
"output_type": "stream",
239+
"text": [
240+
"GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam\n",
241+
"GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle\n",
242+
"GOT EMBEDDING FROM OPENAI FOR A card sharp steps i\n",
243+
"GOT EMBEDDING FROM OPENAI FOR Template:Section Edi\n",
244+
"GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h\n",
245+
"GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r\n",
246+
"GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn\n",
247+
"GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d\n",
248+
"GOT EMBEDDING FROM OPENAI FOR The film centers on \n",
249+
"GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi\n",
250+
"GOT EMBEDDING FROM OPENAI FOR One year after gradu\n",
251+
"GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus\n",
252+
"GOT EMBEDDING FROM OPENAI FOR California gubernato\n",
253+
"GOT EMBEDDING FROM OPENAI FOR In San Francisco in \n",
254+
"GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma\n",
255+
"GOT EMBEDDING FROM OPENAI FOR A radical campus gro\n",
256+
"GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li\n",
257+
"GOT EMBEDDING FROM OPENAI FOR Broadway star Al How\n",
258+
"GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd\n",
259+
"GOT EMBEDDING FROM OPENAI FOR When Mary Beekman (I\n",
260+
"GOT EMBEDDING FROM OPENAI FOR Set somewhere in Vie\n",
261+
"GOT EMBEDDING FROM OPENAI FOR At Hampstead Court H\n",
262+
"GOT EMBEDDING FROM OPENAI FOR When top Broadway bo\n",
263+
"GOT EMBEDDING FROM OPENAI FOR Diamond Jim Brady (E\n",
264+
"GOT EMBEDDING FROM OPENAI FOR Lieut. Bill Branniga\n",
265+
"GOT EMBEDDING FROM OPENAI FOR Rodeo star John Scot\n",
266+
"GOT EMBEDDING FROM OPENAI FOR Paul Madvig (Edward \n",
267+
"GOT EMBEDDING FROM OPENAI FOR Luisa Ginglebusher (\n",
268+
"GOT EMBEDDING FROM OPENAI FOR In the resort of Lak\n",
269+
"GOT EMBEDDING FROM OPENAI FOR John Mason chases af\n",
270+
"GOT EMBEDDING FROM OPENAI FOR In the time of Jesus\n",
271+
"GOT EMBEDDING FROM OPENAI FOR In New York City, Dr\n",
272+
"GOT EMBEDDING FROM OPENAI FOR Don Phelan, the ace \n",
273+
"GOT EMBEDDING FROM OPENAI FOR Wealthy and charitab\n",
274+
"GOT EMBEDDING FROM OPENAI FOR In Manhattan's lower\n",
275+
"GOT EMBEDDING FROM OPENAI FOR In Dublin in 1922, G\n",
276+
"GOT EMBEDDING FROM OPENAI FOR Lawrence (Pat O'Brie\n",
277+
"GOT EMBEDDING FROM OPENAI FOR Jim Buchanan (Marsha\n",
278+
"GOT EMBEDDING FROM OPENAI FOR Kay Bentley (Joan Cr\n",
279+
"GOT EMBEDDING FROM OPENAI FOR In London, Stella Pa\n",
280+
"GOT EMBEDDING FROM OPENAI FOR Annette Monard Stree\n",
281+
"GOT EMBEDDING FROM OPENAI FOR Belle McGill is unaw\n",
282+
"GOT EMBEDDING FROM OPENAI FOR A ranch foreman trie\n",
283+
"GOT EMBEDDING FROM OPENAI FOR A publisher bets an \n",
284+
"GOT EMBEDDING FROM OPENAI FOR A racketeer known as\n",
285+
"GOT EMBEDDING FROM OPENAI FOR Dr. Holden (Ralph Be\n",
286+
"GOT EMBEDDING FROM OPENAI FOR The life and loves o\n",
287+
"GOT EMBEDDING FROM OPENAI FOR Brought up in povert\n",
288+
"GOT EMBEDDING FROM OPENAI FOR Before the First Wor\n",
289+
"GOT EMBEDDING FROM OPENAI FOR Laura Bayles has bee\n"
290+
]
291+
}
292+
],
293+
"source": [
294+
"# This line actually generates the embeddings\n",
295+
"plot_embeddings = [embedding_from_string(plot, model=\"text-embedding-ada-002\") for plot in movie_plots]"
95296
]
96297
}
97298
],

playground/README.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,10 @@ You need to create a `.env` file with your `OPENAI_API_KEY`.
8282
## Embeddings
8383

8484
- generating a single embedding.
85-
- creating a movie embedding visualization with Atlas.
8685
- getting our movie data.
8786
- getting our movie data ready.
88-
- generating embeddings for 5000 movies.
89-
- visualizing our embeddings with atlas.
87+
- generating embeddings for 50 movies.
88+
- visualizing our embeddings with Atlas.
9089
- recommending movies using our embeddings.
9190

9291
[Check the notebook](12-embeddings.ipynb)
Binary file not shown.

requirements.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ comm==0.1.3
2020
debugpy==1.6.7
2121
decorator==5.1.1
2222
defusedxml==0.7.1
23+
et-xmlfile==1.1.0
2324
executing==1.2.0
2425
fastjsonschema==2.16.3
25-
Flask==2.3.1
26+
Flask==2.3.2
2627
fqdn==1.5.1
2728
frozenlist==1.3.3
2829
idna==3.4
@@ -59,8 +60,10 @@ notebook==6.5.4
5960
notebook_shim==0.2.2
6061
numpy==1.24.3
6162
openai==0.27.4
63+
openpyxl==3.1.2
6264
packaging==23.1
6365
pandas==2.0.1
66+
pandas-stubs==2.0.1.230501
6467
pandocfilters==1.5.0
6568
parso==0.8.3
6669
pexpect==4.8.0
@@ -100,6 +103,7 @@ tinycss2==1.2.1
100103
tornado==6.3.1
101104
tqdm==4.65.0
102105
traitlets==5.9.0
106+
types-pytz==2023.3.0.0
103107
typing_extensions==4.5.0
104108
tzdata==2023.3
105109
uri-template==1.2.0
@@ -108,7 +112,7 @@ wcwidth==0.2.6
108112
webcolors==1.13
109113
webencodings==0.5.1
110114
websocket-client==1.5.1
111-
Werkzeug==2.3.0
115+
Werkzeug==2.3.3
112116
widgetsnbextension==4.0.7
113117
yarl==1.9.1
114118
zipp==3.15.0

0 commit comments

Comments
 (0)