16 | 16 | },
17 | 17 | {
18 | 18 | "cell_type": "code",
19 |    | - "execution_count": 1,
   | 19 | + "execution_count": null,
20 | 20 | "metadata": {},
21 | 21 | "outputs": [
22 | 22 | {

31 | 31 | }
32 | 32 | ],
33 | 33 | "source": [
34 |    | - "%pip install -qU deeplake openai"
   | 34 | + "!pip install deeplake openai"
35 | 35 | ]
36 | 36 | },
37 | 37 | {

45 | 45 | "cell_type": "markdown",
46 | 46 | "metadata": {},
47 | 47 | "source": [
48 |    | - "We use the `wget` command to download a JSON file containing scraped restaurant data directly from the specified GitHub repository. This file, `scraped_restaurant_data.json`, will be saved to the current working directory, making it readily available for data processing and analysis tasks in the notebook. If you want to perform the scraping yourself, you can use libraries like **[Scrape Graph AI](https://github.com/ScrapeGraphAI/Scrapegraph-ai)** to gather data directly from websites."
   | 48 | + "We use the `requests` library to download a JSON file containing scraped restaurant data directly from the specified GitHub repository. The parsed data is loaded straight into memory, making it readily available for data processing and analysis tasks in the notebook. If you want to perform the scraping yourself, you can use libraries like **[Scrape Graph AI](https://github.com/ScrapeGraphAI/Scrapegraph-ai)** to gather data directly from websites."
49 | 49 | ]
50 | 50 | },
51 | 51 | {
52 |    | - "cell_type": "markdown",
   | 52 | + "cell_type": "code",
   | 53 | + "execution_count": null,
53 | 54 | "metadata": {},
   | 55 | + "outputs": [],
54 | 56 | "source": [
55 |    | - "%pip install -q requests"
   | 57 | + "!pip install requests"
56 | 58 | ]
57 | 59 | },
58 | 60 | {
59 | 61 | "cell_type": "code",
60 |    | - "execution_count": 158,
   | 62 | + "execution_count": 13,
61 | 63 | "metadata": {},
62 | 64 | "outputs": [],
63 | 65 | "source": [

66 | 68 | "url = \"https://raw.githubusercontent.com/activeloopai/notebook-v4/refs/heads/main/scraped_restaurant_data.json\"\n",
67 | 69 | "file_name = \"scraped_restaurant_data.json\"\n",
68 | 70 | "\n",
69 |    | - "response = requests.get(url)\n",
70 |    | - "response.raise_for_status() # Check if the request was successful\n",
71 |    | - "\n",
72 |    | - "with open(file_name, \"wb\") as file:\n",
73 |    | - " file.write(response.content)"
74 |    | - ]
75 |    | - },
76 |    | - {
77 |    | - "cell_type": "code",
78 |    | - "execution_count": 159,
79 |    | - "metadata": {},
80 |    | - "outputs": [],
81 |    | - "source": [
82 |    | - "import json\n",
83 |    | - "with open(\"scraped_restaurant_data.json\", \"r\") as f:\n",
84 |    | - " scraped_data = json.load(f)"
   | 71 | + "scraped_data = requests.get(url).json()"
85 | 72 | ]
86 | 73 | },
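Note that with the consolidated one-liner the `file_name` variable above is no longer used, and the `raise_for_status()` check from the removed cells is gone. A minimal sketch, assuming the same `url`, that keeps explicit HTTP error handling while still parsing the response in memory:

```python
import requests

url = "https://raw.githubusercontent.com/activeloopai/notebook-v4/refs/heads/main/scraped_restaurant_data.json"

response = requests.get(url)
response.raise_for_status()  # raise early on 4xx/5xx instead of parsing an error page
scraped_data = response.json()  # decode the JSON body straight into Python objects
```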
|
87 | 74 | {
|
88 | 75 | "cell_type": "code",
|
89 |
| - "execution_count": 160, |
| 76 | + "execution_count": 3, |
90 | 77 | "metadata": {},
|
91 | 78 | "outputs": [
|
92 | 79 | {
|
|
158 | 145 | },
|
159 | 146 | {
|
160 | 147 | "cell_type": "code",
|
161 |
| - "execution_count": 74, |
| 148 | + "execution_count": null, |
162 | 149 | "metadata": {},
|
163 | 150 | "outputs": [],
|
164 | 151 | "source": [
|
165 |
| - "ds.add_column(\"restaurant_name\", types.Text(index_type=types.TextIndexType.Inverted))\n", |
166 |
| - "ds.add_column(\"restaurant_description\", types.Text(index_type=types.TextIndexType.Inverted))" |
| 152 | + "ds.add_column(\"restaurant_name\", types.Text(index_type=types.Inverted))\n", |
| 153 | + "ds.add_column(\"restaurant_description\", types.Text(index_type=types.Inverted))" |
167 | 154 | ]
|
168 | 155 | },
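Per the notebook's flow, these inverted-index columns back the exact keyword filter used later in the diff. A minimal sketch of the intended pattern, assuming `ds` is the Deep Lake dataset created earlier in the notebook and `types` is `deeplake.types`:

```python
from deeplake import types

# ds: Deep Lake dataset created earlier in the notebook (not shown in this diff)
ds.add_column("restaurant_name", types.Text(index_type=types.Inverted))
ds.add_column("restaurant_description", types.Text(index_type=types.Inverted))
ds.commit()

# The inverted index serves exact keyword filters such as CONTAINS
view = ds.query("SELECT * WHERE CONTAINS(restaurant_description, 'burritos')")
```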
169 | 156 | {

182 | 169 | },
183 | 170 | {
184 | 171 | "cell_type": "code",
185 |     | - "execution_count": 76,
    | 172 | + "execution_count": 14,
186 | 173 | "metadata": {},
187 | 174 | "outputs": [],
188 | 175 | "source": [

209 | 196 | },
210 | 197 | {
211 | 198 | "cell_type": "code",
212 |     | - "execution_count": 77,
    | 199 | + "execution_count": 15,
213 | 200 | "metadata": {},
214 | 201 | "outputs": [],
215 | 202 | "source": [

258 | 245 | },
259 | 246 | {
260 | 247 | "cell_type": "code",
261 |     | - "execution_count": 79,
    | 248 | + "execution_count": 16,
262 | 249 | "metadata": {},
263 | 250 | "outputs": [
264 | 251 | {

267 | 254 | "Dataset(columns=(restaurant_name,restaurant_description), length=4)"
268 | 255 | ]
269 | 256 | },
270 |     | - "execution_count": 79,
    | 257 | + "execution_count": 16,
271 | 258 | "metadata": {},
272 | 259 | "output_type": "execute_result"
273 | 260 | }
274 | 261 | ],
275 | 262 | "source": [
276 | 263 | "word = 'burritos'\n",
277 |     | - "view = ds.query(\n",
278 |     | - " f\"\"\"\n",
279 |     | - " SELECT *\n",
    | 264 | + "view = ds.query(f\"\"\"\n",
    | 265 | + " SELECT * \n",
280 | 266 | " WHERE CONTAINS(restaurant_description, '{word}')\n",
281 |     | - " \"\"\"\n",
282 |     | - ")\n",
    | 267 | + "\"\"\")\n",
283 | 268 | "view"
284 | 269 | ]
285 | 270 | },

374 | 359 | },
375 | 360 | {
376 | 361 | "cell_type": "code",
377 |     | - "execution_count": 82,
    | 362 | + "execution_count": null,
378 | 363 | "metadata": {},
379 | 364 | "outputs": [
380 | 365 | {

395 | 380 | ],
396 | 381 | "source": [
397 | 382 | "# Add columns to the dataset\n",
398 |     | - "ds_bm25.add_column(\"restaurant_name\", types.Text(index_type=types.TextIndexType.BM25))\n",
399 |     | - "ds_bm25.add_column(\"restaurant_description\", types.Text(index_type=types.TextIndexType.BM25))\n",
    | 383 | + "ds_bm25.add_column(\"restaurant_name\", types.Text(index_type=types.BM25))\n",
    | 384 | + "ds_bm25.add_column(\"restaurant_description\", types.Text(index_type=types.BM25))\n",
400 | 385 | "ds_bm25.commit()\n",
401 | 386 | "ds_bm25.summary()"
402 | 387 | ]

463 | 448 | },
464 | 449 | {
465 | 450 | "cell_type": "code",
466 |     | - "execution_count": 84,
    | 451 | + "execution_count": null,
467 | 452 | "metadata": {},
468 | 453 | "outputs": [
469 | 454 | {

479 | 464 | ],
480 | 465 | "source": [
481 | 466 | "query = \"I want burritos\"\n",
482 |     | - "view_bm25 = ds_bm25.query(\n",
483 |     | - " f\"\"\"\n",
    | 467 | + "view_bm25 = ds_bm25.query(f\"\"\"\n",
484 | 468 | " SELECT * \n",
485 | 469 | " ORDER BY BM25_SIMILARITY(restaurant_description, '{query}') DESC \n",
486 |     | - " LIMIT 10\n",
487 |     | - " \"\"\"\n",
488 |     | - ")\n",
    | 470 | + " LIMIT 10 \n",
    | 471 | + "\"\"\")\n",
489 | 472 | "view_bm25"
490 | 473 | ]
491 | 474 | },

602 | 585 | "\n",
603 | 586 | "# Add columns to the dataset\n",
604 | 587 | "vector_search.add_column(name=\"embedding\", dtype=types.Embedding(3072))\n",
605 |     | - "vector_search.add_column(name=\"restaurant_name\", dtype=types.Text(index_type=types.TextIndexType.BM25))\n",
606 |     | - "vector_search.add_column(name=\"restaurant_description\", dtype=types.Text(index_type=types.TextIndexType.BM25))\n",
    | 588 | + "vector_search.add_column(name=\"restaurant_name\", dtype=types.Text(index_type=types.BM25))\n",
    | 589 | + "vector_search.add_column(name=\"restaurant_description\", dtype=types.Text(index_type=types.BM25))\n",
607 | 590 | "vector_search.commit()"
608 | 591 | ]
609 | 592 | },
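The `embedding` column is sized at 3072 dimensions, which matches the output of OpenAI's `text-embedding-3-large` model. A hedged sketch of a vector query against this column; `COSINE_SIMILARITY` and the embedding call are assumptions based on Deep Lake's TQL and OpenAI's Python SDK, not code shown in this diff:

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# Embed the query with the same 3072-dim model the column was sized for
query = "I want burritos"
embedding = client.embeddings.create(model="text-embedding-3-large", input=query).data[0].embedding

# Rank rows by cosine similarity between stored and query embeddings
embedding_str = ",".join(str(x) for x in embedding)
view = vector_search.query(f"""
    SELECT *
    ORDER BY COSINE_SIMILARITY(embedding, ARRAY[{embedding_str}]) DESC
    LIMIT 5
""")
```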
1063 | 1046 | }
1064 | 1047 | ],
1065 | 1048 | "source": [
1066 |      | - "%pip install -qU numpy pydantic"
     | 1049 | + "!pip install numpy pydantic"
1067 | 1050 | ]
1068 | 1051 | },
1069 | 1052 | {

1500 | 1483 | },
1501 | 1484 | {
1502 | 1485 | "cell_type": "code",
1503 |      | - "execution_count": 1,
     | 1486 | + "execution_count": null,
1504 | 1487 | "metadata": {},
1505 | 1488 | "outputs": [
1506 | 1489 | {

1519 | 1502 | }
1520 | 1503 | ],
1521 | 1504 | "source": [
1522 |      | - "%pip install -qU torch torchvision\n",
1523 |      | - "%pip install -q git+https://github.com/openai/CLIP.git"
     | 1505 | + "!pip install -U torch torchvision\n",
     | 1506 | + "!pip install git+https://github.com/openai/CLIP.git"
1524 | 1507 | ]
1525 | 1508 | },
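The cells that load and apply CLIP are not part of this diff; for reference, the package installed here is typically used along these lines (the `ViT-B/32` checkpoint and image path are illustrative assumptions):

```python
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)  # downloads weights on first use

# Encode one image and one caption into CLIP's shared embedding space
image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(["a photo of a burrito"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
```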
1526 | 1509 | {

1823 | 1806 | },
1824 | 1807 | {
1825 | 1808 | "cell_type": "code",
1826 |      | - "execution_count": 2,
     | 1809 | + "execution_count": null,
1827 | 1810 | "metadata": {},
1828 | 1811 | "outputs": [
1829 | 1812 | {

1838 | 1821 | }
1839 | 1822 | ],
1840 | 1823 | "source": [
1841 |      | - "%pip install -q matplotlib"
     | 1824 | + "!pip install matplotlib"
1842 | 1825 | ]
1843 | 1826 | },
1844 | 1827 | {

1931 | 1914 | },
1932 | 1915 | {
1933 | 1916 | "cell_type": "code",
1934 |      | - "execution_count": 3,
     | 1917 | + "execution_count": null,
1935 | 1918 | "metadata": {},
1936 | 1919 | "outputs": [
1937 | 1920 | {

1946 | 1929 | }
1947 | 1930 | ],
1948 | 1931 | "source": [
1949 |      | - "%pip install -q colpali-engine accelerate"
     | 1932 | + "!pip install colpali-engine accelerate"
1950 | 1933 | ]
1951 | 1934 | },
1952 | 1935 | {
1953 | 1936 | "cell_type": "markdown",
1954 | 1937 | "metadata": {},
1955 | 1938 | "source": [
1956 |      | - "### Download the ColPali model from Hugging Face"
     | 1939 | + "### Download the ColPali model"
1957 | 1940 | ]
1958 | 1941 | },
1959 | 1942 | {

2023 | 2006 | },
2024 | 2007 | {
2025 | 2008 | "cell_type": "code",
2026 |      | - "execution_count": 4,
     | 2009 | + "execution_count": null,
2027 | 2010 | "metadata": {},
2028 | 2011 | "outputs": [
2029 | 2012 | {

2038 | 2021 | }
2039 | 2022 | ],
2040 | 2023 | "source": [
2041 |      | - "%pip install -q datasets"
     | 2024 | + "!pip install datasets"
2042 | 2025 | ]
2043 | 2026 | },
2044 | 2027 | {