feat: add generated embeddings

solygambas · solygambas · commit 84fe98c5798b · 2023-05-03T08:19:15.000+02:00
diff --git a/playground/12-embeddings.ipynb b/playground/12-embeddings.ipynb
@@ -67,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -77,21 +77,222 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "dataset_path = \"./datasets/movie_plots.csv\"\n",
     "df = pd.read_csv(dataset_path)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movie_plots = movies[\"Plot\"].values"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generating the embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
+    "import pickle\n",
+    "import tiktoken"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
+    "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
+    "\n",
+    "    # replace newlines, which can negatively affect performance.\n",
+    "    text = text.replace(\"\\n\", \" \")\n",
+    "\n",
+    "    return openai.Embedding.create(input=text, model=model)[\"data\"][0][\"embedding\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enc = tiktoken.encoding_for_model(\"text-embedding-ada-002\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_tokens = sum([len(enc.encode(plot)) for plot in movie_plots])"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "16751"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "total_tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Estimated cost $0.01\n"
+     ]
+    }
+   ],
+   "source": [
+    "cost = total_tokens * (.0004 / 1000)\n",
+    "print(f\"Estimated cost ${cost:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(500)"
+    "# establish a cache of embeddings to avoid recomputing\n",
+    "# cache is a dict mapping (text, model) tuples to embeddings, saved as a pickle file\n",
+    "\n",
+    "# set path to embedding cache\n",
+    "embedding_cache_path = \"./embeddings/movie_embeddings_cache.pkl\"\n",
+    "\n",
+    "# load the cache if it exists (otherwise start with an empty one), then write it back to disk\n",
+    "try:\n",
+    "    embedding_cache = pd.read_pickle(embedding_cache_path)\n",
+    "except FileNotFoundError:\n",
+    "    embedding_cache = {}\n",
+    "with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
+    "    pickle.dump(embedding_cache, embedding_cache_file)\n",
+    "\n",
+    "# define a function to retrieve embeddings from the cache if present, and otherwise request via the API\n",
+    "def embedding_from_string(\n",
+    "    string,\n",
+    "    model=\"text-embedding-ada-002\",\n",
+    "    embedding_cache=embedding_cache\n",
+    "):\n",
+    "    \"\"\"Return embedding of given string, using a cache to avoid recomputing.\"\"\"\n",
+    "    if (string, model) not in embedding_cache.keys():\n",
+    "        embedding_cache[(string, model)] = get_embedding(string, model)\n",
+    "        print(f\"GOT EMBEDDING FROM OPENAI FOR {string[:20]}\")\n",
+    "        with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
+    "            pickle.dump(embedding_cache, embedding_cache_file)\n",
+    "    return embedding_cache[(string, model)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam\n",
+      "GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle\n",
+      "GOT EMBEDDING FROM OPENAI FOR A card sharp steps i\n",
+      "GOT EMBEDDING FROM OPENAI FOR Template:Section Edi\n",
+      "GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h\n",
+      "GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r\n",
+      "GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn\n",
+      "GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d\n",
+      "GOT EMBEDDING FROM OPENAI FOR The film centers on \n",
+      "GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi\n",
+      "GOT EMBEDDING FROM OPENAI FOR One year after gradu\n",
+      "GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus\n",
+      "GOT EMBEDDING FROM OPENAI FOR California gubernato\n",
+      "GOT EMBEDDING FROM OPENAI FOR In San Francisco in \n",
+      "GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma\n",
+      "GOT EMBEDDING FROM OPENAI FOR A radical campus gro\n",
+      "GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li\n",
+      "GOT EMBEDDING FROM OPENAI FOR Broadway star Al How\n",
+      "GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd\n",
+      "GOT EMBEDDING FROM OPENAI FOR When Mary Beekman (I\n",
+      "GOT EMBEDDING FROM OPENAI FOR Set somewhere in Vie\n",
+      "GOT EMBEDDING FROM OPENAI FOR At Hampstead Court H\n",
+      "GOT EMBEDDING FROM OPENAI FOR When top Broadway bo\n",
+      "GOT EMBEDDING FROM OPENAI FOR Diamond Jim Brady (E\n",
+      "GOT EMBEDDING FROM OPENAI FOR Lieut. Bill Branniga\n",
+      "GOT EMBEDDING FROM OPENAI FOR Rodeo star John Scot\n",
+      "GOT EMBEDDING FROM OPENAI FOR Paul Madvig (Edward \n",
+      "GOT EMBEDDING FROM OPENAI FOR Luisa Ginglebusher (\n",
+      "GOT EMBEDDING FROM OPENAI FOR In the resort of Lak\n",
+      "GOT EMBEDDING FROM OPENAI FOR John Mason chases af\n",
+      "GOT EMBEDDING FROM OPENAI FOR In the time of Jesus\n",
+      "GOT EMBEDDING FROM OPENAI FOR In New York City, Dr\n",
+      "GOT EMBEDDING FROM OPENAI FOR Don Phelan, the ace \n",
+      "GOT EMBEDDING FROM OPENAI FOR Wealthy and charitab\n",
+      "GOT EMBEDDING FROM OPENAI FOR In Manhattan's lower\n",
+      "GOT EMBEDDING FROM OPENAI FOR In Dublin in 1922, G\n",
+      "GOT EMBEDDING FROM OPENAI FOR Lawrence (Pat O'Brie\n",
+      "GOT EMBEDDING FROM OPENAI FOR Jim Buchanan (Marsha\n",
+      "GOT EMBEDDING FROM OPENAI FOR Kay Bentley (Joan Cr\n",
+      "GOT EMBEDDING FROM OPENAI FOR In London, Stella Pa\n",
+      "GOT EMBEDDING FROM OPENAI FOR Annette Monard Stree\n",
+      "GOT EMBEDDING FROM OPENAI FOR Belle McGill is unaw\n",
+      "GOT EMBEDDING FROM OPENAI FOR A ranch foreman trie\n",
+      "GOT EMBEDDING FROM OPENAI FOR A publisher bets an \n",
+      "GOT EMBEDDING FROM OPENAI FOR A racketeer known as\n",
+      "GOT EMBEDDING FROM OPENAI FOR Dr. Holden (Ralph Be\n",
+      "GOT EMBEDDING FROM OPENAI FOR The life and loves o\n",
+      "GOT EMBEDDING FROM OPENAI FOR Brought up in povert\n",
+      "GOT EMBEDDING FROM OPENAI FOR Before the First Wor\n",
+      "GOT EMBEDDING FROM OPENAI FOR Laura Bayles has bee\n"
+     ]
+    }
+   ],
+   "source": [
+    "# This line actually generates the embeddings\n",
+    "plot_embeddings = [embedding_from_string(plot, model=\"text-embedding-ada-002\") for plot in movie_plots]"
    ]
   }
  ],
diff --git a/playground/README.md b/playground/README.md
@@ -82,11 +82,10 @@ You need to create a `.env` file with your `OPENAI_API_KEY`.
 ## Embeddings
 
 - generating a single embedding.
-- creating a movie embedding visualization with Atlas.
 - getting our movie data.
 - getting our movie data ready.
-- generating embeddings for 5000 movies.
-- visualizing our embeddings with atlas.
+- generating embeddings for 50 movies.
+- visualizing our embeddings with Atlas.
 - recommending movies using our embeddings.
 
 [Check the notebook](12-embeddings.ipynb)
diff --git a/playground/embeddings/movie_embeddings_cache.pkl b/playground/embeddings/movie_embeddings_cache.pkl
diff --git a/requirements.txt b/requirements.txt
@@ -20,9 +20,10 @@ comm==0.1.3
 debugpy==1.6.7
 decorator==5.1.1
 defusedxml==0.7.1
+et-xmlfile==1.1.0
 executing==1.2.0
 fastjsonschema==2.16.3
-Flask==2.3.1
+Flask==2.3.2
 fqdn==1.5.1
 frozenlist==1.3.3
 idna==3.4
@@ -59,8 +60,10 @@ notebook==6.5.4
 notebook_shim==0.2.2
 numpy==1.24.3
 openai==0.27.4
+openpyxl==3.1.2
 packaging==23.1
 pandas==2.0.1
+pandas-stubs==2.0.1.230501
 pandocfilters==1.5.0
 parso==0.8.3
 pexpect==4.8.0
@@ -100,6 +103,7 @@ tinycss2==1.2.1
 tornado==6.3.1
 tqdm==4.65.0
 traitlets==5.9.0
+types-pytz==2023.3.0.0
 typing_extensions==4.5.0
 tzdata==2023.3
 uri-template==1.2.0
@@ -108,7 +112,7 @@ wcwidth==0.2.6
 webcolors==1.13
 webencodings==0.5.1
 websocket-client==1.5.1
-Werkzeug==2.3.0
+Werkzeug==2.3.3
 widgetsnbextension==4.0.7
 yarl==1.9.1
 zipp==3.15.0