Develop #1

Open · wants to merge 8 commits into master
140 changes: 84 additions & 56 deletions notebooks/load_data.ipynb
@@ -21,66 +21,69 @@
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Dataset \n",
"\n",
"* author: steeve laquitaine\n",
"\n",
"* Workload (1 hour to debug)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import wget"
"import wget\n",
"import time\n",
"import tarfile\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 55,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "The project path is: /Users/steeve_laquitaine/Desktop/CodeHub/nlp_txt_similarity\n"
}
],
"outputs": [],
"source": [
"# set data paths \n",
"os.chdir('..')\n",
"print('The project path is:', os.getcwd())\n",
"\n",
"data_path = 'data/'\n",
"raw_data_path = data_path + \"01_raw/\"\n",
"init_dataset_path = data_path + \"02_intermediate/\"\n",
"os.chdir('..') # should be in my_project/notebooks/\n",
"data_path = os.getcwd()\n",
"raw_data_path = data_path + \"/data/01_raw/\"\n",
"preprocessed_data_path = data_path + \"/data/02_preprocessed/\"\n",
"url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n",
"download_output = raw_data_path + \"/dataset_compressed.tar.gz\""
"download_output = raw_data_path + \"aclImdb_v1.tar.gz\"\n",
"train_path = preprocessed_data_path + \"train.csv\"\n",
"test_path = preprocessed_data_path + \"test.csv\""
]
},
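One re-run hazard in this cell: os.chdir('..') climbs one directory every time it executes, so running the cell twice silently breaks every path built from os.getcwd(). A minimal sketch of the same setup without chdir, assuming the notebook lives in my_project/notebooks/ as the comment suggests:

# Build the paths from the notebook's location instead of mutating the
# working directory; safe to re-run any number of times.
from pathlib import Path

project_root = Path.cwd().parent  # my_project/
raw_data_path = project_root / "data" / "01_raw"
preprocessed_data_path = project_root / "data" / "02_preprocessed"
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_output = raw_data_path / "aclImdb_v1.tar.gz"
train_path = preprocessed_data_path / "train.csv"
test_path = preprocessed_data_path / "test.csv"
# (cast with str() where a library insists on string paths)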
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'wget' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-3ff9859f4e46>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# download\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mwget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'wget' is not defined"
]
}
],
"execution_count": 32,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# download\n",
"wget.download(url, download_output)"
"## download (to debug)\n",
"## !! FileNotFoundError: No such file or directory: './data/01_raw/dataset_compressed.tar.gzoul9mi4u.tmp'\n",
"# print(url)\n",
"# print(download_output)\n",
"# wget.download(url, download_output)"
]
},
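The FileNotFoundError quoted in this cell is consistent with how the wget package downloads: it writes a temporary *.tmp file inside the destination directory and renames it afterwards, so the download fails when data/01_raw/ does not exist yet. A hedged sketch of a fix, assuming that is the cause:

# Make sure the destination directory exists before wget creates its
# temporary file there, and skip the download if the archive is present.
os.makedirs(raw_data_path, exist_ok=True)
if not os.path.exists(download_output):
    wget.download(url, download_output)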
{
"cell_type": "code",
"execution_count": null,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -96,16 +99,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# decompress\n",
"decompress_dataset(download_output, raw_data_path)\n"
]
},
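The body of decompress_dataset is collapsed in this diff. Given the tarfile import added in the first cell and the call site above, a plausible reconstruction might be (hypothetical, not necessarily the author's code):

# Hypothetical sketch of the collapsed helper: extract a .tar.gz archive
# into the given output directory.
def decompress_dataset(archive_path, output_path):
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=output_path)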
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"decompress_dataset(download_output, raw_data_path)"
"# delete compressed file\n",
"os.remove(download_output) "
]
},
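A small robustness note: os.remove raises FileNotFoundError on a second run once the archive is gone. A guarded sketch keeps the cleanup idempotent:

# Delete the compressed archive only if it is still there.
if os.path.exists(download_output):
    os.remove(download_output)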
{
"cell_type": "code",
"execution_count": null,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -132,26 +148,20 @@
" PATH = Path(raw_data_path + 'aclImdb/')\n",
" CLAS_PATH = Path(output_path)\n",
"\n",
" # 3 - Processing and storing train dataset\n",
" # 3 - Process and store train dataset\n",
" print(PATH)\n",
" trn_texts, trn_labels = get_texts(PATH / 'train', CLASSES)\n",
" print(len(trn_texts))\n",
" print(len(trn_labels))\n",
" #trn_idx = np.random.permutation(len(trn_texts))\n",
" #trn_texts = trn_texts[trn_idx]\n",
" #trn_labels = trn_labels[trn_idx]\n",
" df_trn = pd.DataFrame({'review': trn_texts, 'sentiment': trn_labels}, columns=col_names)\n",
" df_trn[df_trn['sentiment'] != 2].to_csv(CLAS_PATH / 'train.csv', index=False)\n",
" \n",
" # 4 - Processing and storing evaluation dataset\n",
" # 4 - Process and store evaluation dataset\n",
" val_texts, val_labels = get_texts(PATH / 'test', CLASSES)\n",
" #val_idx = np.random.permutation(len(val_texts))\n",
" #val_texts = val_texts[val_idx]\n",
" #val_labels = val_labels[val_idx]\n",
" df_val = pd.DataFrame({'review': val_texts, 'sentiment': val_labels}, columns=col_names)\n",
" df_val.to_csv(CLAS_PATH / 'test.csv', index=False)\n",
" \n",
" # 5 - Store the classes\n",
" # 5 - Store classes\n",
" (CLAS_PATH / 'classes.txt').open('w', encoding='utf-8').writelines(f'{o}\\n' for o in CLASSES)\n",
"\n",
" toc = time.time()\n",
@@ -160,16 +170,25 @@
},
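extract_transform_load_dataset relies on a get_texts helper whose definition is also collapsed here. A hypothetical sketch consistent with its call sites, assuming one subdirectory per class under train/ and test/ as in the aclImdb layout:

# Hypothetical reconstruction: read every .txt review under each class
# subdirectory and encode the label as the class index.
def get_texts(path, classes):
    texts, labels = [], []
    for idx, label in enumerate(classes):
        for fname in (path / label).glob("*.txt"):
            texts.append(fname.open("r", encoding="utf-8").read())
            labels.append(idx)
    return np.array(texts), np.array(labels)

The df_trn['sentiment'] != 2 filter suggests CLASSES has three entries, presumably neg, pos, and the unlabeled unsup split of aclImdb, with the unlabeled rows dropped before train.csv is written; that would also explain the 75000 training texts printed in the next cell.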
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 52,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "/Users/steeve_laquitaine/Desktop/CodeHub/nlp_txt_similarity/data/01_raw/aclImdb\n75000\n75000\n54.57 sec\n"
}
],
"source": [
"extract_transform_load_dataset(raw_data_path, init_dataset_path, timeit = True)"
"# ETL (1 min)\n",
"extract_transform_load_dataset(raw_data_path, preprocessed_data_path, timeit = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -201,11 +220,20 @@
},
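load_dataset's body is collapsed as well. A sketch consistent with the call in the next cell and its printed output (optional row sampling, head of the training frame printed, timing reported, labels returned before reviews); hypothetical, under those assumptions:

# Hypothetical reconstruction of the collapsed loader.
def load_dataset(train_path, test_path, sample=None):
    tic = time.time()
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)
    if sample is not None:
        df_train = df_train.sample(n=sample)
        df_test = df_test.sample(n=sample)
    print(df_train.head())
    print("(load_dataset)", time.time() - tic)
    return (df_train["sentiment"].values, df_test["sentiment"].values,
            df_train["review"].values, df_test["review"].values)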
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 67,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "sentiment review\n16560 1 I saw this film in the worst possible circumst...\n10876 0 SPOILERS THROUGHOUT!!!!<br /><br />I had read ...\n15889 1 Time and time again, I've stated that if peopl...\n12920 1 One of my favourite films. It has everything -...\n13565 1 Maybe I'm a sap but this is the sweetest movie...\n(load_dataset) 0.5145790576934814\nCompleted 0.5147788524627686\n"
}
],
"source": [
"X_train, X_test, Y_train, Y_test = load_dataset(train_path, test_path, sample=5000)\n"
"# load datasets (arrays, 1 sec)\n",
"Y_train, Y_test, X_train, X_test = load_dataset(train_path, test_path, sample=5000)\n"
]
}
]