Develop #1

Open · wants to merge 8 commits into master
140 changes: 84 additions & 56 deletions notebooks/load_data.ipynb
@@ -21,66 +21,69 @@
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Dataset \n",
"\n",
"* author: steeve laquitaine\n",
"\n",
"* Workload (1 hour to debug)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import wget"
"import wget\n",
"import time\n",
"import tarfile\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 55,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "The project path is: /Users/steeve_laquitaine/Desktop/CodeHub/nlp_txt_similarity\n"
}
],
"outputs": [],
"source": [
"# set data paths \n",
"os.chdir('..')\n",
"print('The project path is:', os.getcwd())\n",
"\n",
"data_path = 'data/'\n",
"raw_data_path = data_path + \"01_raw/\"\n",
"init_dataset_path = data_path + \"02_intermediate/\"\n",
"os.chdir('..') # should be in my_project/notebooks/\n",
"data_path = os.getcwd()\n",
"raw_data_path = data_path + \"/data/01_raw/\"\n",
"preprocessed_data_path = data_path + \"/data/02_preprocessed/\"\n",
"url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n",
"download_output = raw_data_path + \"/dataset_compressed.tar.gz\""
"download_output = raw_data_path + \"aclImdb_v1.tar.gz\"\n",
"train_path = preprocessed_data_path + \"train.csv\"\n",
"test_path = preprocessed_data_path + \"test.csv\""
]
},
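One re-run hazard in this cell: os.chdir('..') climbs one directory every time it executes, so running the cell twice silently breaks every path built from os.getcwd(). A minimal sketch of the same setup without chdir, assuming the notebook lives in my_project/notebooks/ as the comment suggests:

# Build the paths from the notebook's location instead of mutating the
# working directory; safe to re-run any number of times.
from pathlib import Path

project_root = Path.cwd().parent  # my_project/
raw_data_path = project_root / "data" / "01_raw"
preprocessed_data_path = project_root / "data" / "02_preprocessed"
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_output = raw_data_path / "aclImdb_v1.tar.gz"
train_path = preprocessed_data_path / "train.csv"
test_path = preprocessed_data_path / "test.csv"
# (cast with str() where a library insists on string paths)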
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'wget' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-3ff9859f4e46>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# download\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mwget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'wget' is not defined"
]
}
],
"execution_count": 32,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# download\n",
"wget.download(url, download_output)"
"## download (to debug)\n",
"## !! FileNotFoundError: No such file or directory: './data/01_raw/dataset_compressed.tar.gzoul9mi4u.tmp'\n",
"# print(url)\n",
"# print(download_output)\n",
"# wget.download(url, download_output)"
]
},
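The FileNotFoundError quoted in this cell is consistent with how the wget package downloads: it writes a temporary *.tmp file inside the destination directory and renames it afterwards, so the download fails when data/01_raw/ does not exist yet. A hedged sketch of a fix, assuming that is the cause:

# Make sure the destination directory exists before wget creates its
# temporary file there, and skip the download if the archive is present.
os.makedirs(raw_data_path, exist_ok=True)
if not os.path.exists(download_output):
    wget.download(url, download_output)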
{
"cell_type": "code",
"execution_count": null,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -96,16 +99,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# decompress\n",
"decompress_dataset(download_output, raw_data_path)\n"
]
},
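The body of decompress_dataset is collapsed in this diff. Given the tarfile import added in the first cell and the call site above, a plausible reconstruction might be (hypothetical, not necessarily the author's code):

# Hypothetical sketch of the collapsed helper: extract a .tar.gz archive
# into the given output directory.
def decompress_dataset(archive_path, output_path):
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=output_path)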
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"decompress_dataset(download_output, raw_data_path)"
"# delete compressed file\n",
"os.remove(download_output) "
]
},
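A small robustness note: os.remove raises FileNotFoundError on a second run once the archive is gone. A guarded sketch keeps the cleanup idempotent:

# Delete the compressed archive only if it is still there.
if os.path.exists(download_output):
    os.remove(download_output)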
{
"cell_type": "code",
"execution_count": null,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -132,26 +148,20 @@
" PATH = Path(raw_data_path + 'aclImdb/')\n",
" CLAS_PATH = Path(output_path)\n",
"\n",
" # 3 - Processing and storing train dataset\n",
" # 3 - Process and store train dataset\n",
" print(PATH)\n",
" trn_texts, trn_labels = get_texts(PATH / 'train', CLASSES)\n",
" print(len(trn_texts))\n",
" print(len(trn_labels))\n",
" #trn_idx = np.random.permutation(len(trn_texts))\n",
" #trn_texts = trn_texts[trn_idx]\n",
" #trn_labels = trn_labels[trn_idx]\n",
" df_trn = pd.DataFrame({'review': trn_texts, 'sentiment': trn_labels}, columns=col_names)\n",
" df_trn[df_trn['sentiment'] != 2].to_csv(CLAS_PATH / 'train.csv', index=False)\n",
" \n",
" # 4 - Processing and storing evaluation dataset\n",
" # 4 - Process and store evaluation dataset\n",
" val_texts, val_labels = get_texts(PATH / 'test', CLASSES)\n",
" #val_idx = np.random.permutation(len(val_texts))\n",
" #val_texts = val_texts[val_idx]\n",
" #val_labels = val_labels[val_idx]\n",
" df_val = pd.DataFrame({'review': val_texts, 'sentiment': val_labels}, columns=col_names)\n",
" df_val.to_csv(CLAS_PATH / 'test.csv', index=False)\n",
" \n",
" # 5 - Store the classes\n",
" # 5 - Store classes\n",
" (CLAS_PATH / 'classes.txt').open('w', encoding='utf-8').writelines(f'{o}\\n' for o in CLASSES)\n",
"\n",
" toc = time.time()\n",
@@ -160,16 +170,25 @@
},
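extract_transform_load_dataset relies on a get_texts helper whose definition is also collapsed here. A hypothetical sketch consistent with its call sites, assuming one subdirectory per class under train/ and test/ as in the aclImdb layout:

# Hypothetical reconstruction: read every .txt review under each class
# subdirectory and encode the label as the class index.
def get_texts(path, classes):
    texts, labels = [], []
    for idx, label in enumerate(classes):
        for fname in (path / label).glob("*.txt"):
            texts.append(fname.open("r", encoding="utf-8").read())
            labels.append(idx)
    return np.array(texts), np.array(labels)

The df_trn['sentiment'] != 2 filter suggests CLASSES has three entries, presumably neg, pos, and the unlabeled unsup split of aclImdb, with the unlabeled rows dropped before train.csv is written; that would also explain the 75000 training texts printed in the next cell.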
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 52,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "/Users/steeve_laquitaine/Desktop/CodeHub/nlp_txt_similarity/data/01_raw/aclImdb\n75000\n75000\n54.57 sec\n"
}
],
"source": [
"extract_transform_load_dataset(raw_data_path, init_dataset_path, timeit = True)"
"# ETL (1 min)\n",
"extract_transform_load_dataset(raw_data_path, preprocessed_data_path, timeit = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -201,11 +220,20 @@
},
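load_dataset's body is collapsed as well. A sketch consistent with the call in the next cell and its printed output (optional row sampling, head of the training frame printed, timing reported, labels returned before reviews); hypothetical, under those assumptions:

# Hypothetical reconstruction of the collapsed loader.
def load_dataset(train_path, test_path, sample=None):
    tic = time.time()
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)
    if sample is not None:
        df_train = df_train.sample(n=sample)
        df_test = df_test.sample(n=sample)
    print(df_train.head())
    print("(load_dataset)", time.time() - tic)
    return (df_train["sentiment"].values, df_test["sentiment"].values,
            df_train["review"].values, df_test["review"].values)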
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 67,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "sentiment review\n16560 1 I saw this film in the worst possible circumst...\n10876 0 SPOILERS THROUGHOUT!!!!<br /><br />I had read ...\n15889 1 Time and time again, I've stated that if peopl...\n12920 1 One of my favourite films. It has everything -...\n13565 1 Maybe I'm a sap but this is the sweetest movie...\n(load_dataset) 0.5145790576934814\nCompleted 0.5147788524627686\n"
}
],
"source": [
"X_train, X_test, Y_train, Y_test = load_dataset(train_path, test_path, sample=5000)\n"
"# load datasets (arrays, 1 sec)\n",
"Y_train, Y_test, X_train, X_test = load_dataset(train_path, test_path, sample=5000)\n"
]
}
]