33 changes: 33 additions & 0 deletions .github/workflows/scheduled.yml
@@ -0,0 +1,33 @@
name: Scheduled jobs

on:
  pull_request:
    branches:
      - master
  repository_dispatch:
  schedule:
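    # Once a day at 00:00 UTC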
    - cron: "0 0 * * *"

jobs:
  check_notebooks:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.6]
    defaults:
      run:
        working-directory: docs/source/scripts

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Execute notebooks
        run: |
          python check_notebooks.py --num_proc auto --ignore_whitespace
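For reference, `check_notebooks.py` lives in `docs/source/scripts` and is not part of this diff. A minimal sketch of what such a checker might look like, assuming it executes every notebook in parallel via `nbconvert` (the argument handling mirrors the flags above; `--ignore_whitespace` presumably applies when comparing outputs and is left unused here):

```python
# Hypothetical sketch only; the real check_notebooks.py may differ.
import argparse
import multiprocessing
from pathlib import Path

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


def run_notebook(path):
    # Execute one notebook and report any error instead of raising,
    # so a single failure does not abort the whole run.
    nb = nbformat.read(str(path), as_version=4)
    try:
        ExecutePreprocessor(timeout=600).preprocess(
            nb, {"metadata": {"path": str(path.parent)}}
        )
        return path, None
    except Exception as e:
        return path, str(e)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_proc", default="auto")
    parser.add_argument("--ignore_whitespace", action="store_true")
    args = parser.parse_args()

    num_proc = (
        multiprocessing.cpu_count() if args.num_proc == "auto" else int(args.num_proc)
    )
    notebooks = sorted(Path("../notebooks").glob("*.ipynb"))  # assumed layout
    with multiprocessing.Pool(num_proc) as pool:
        results = pool.map(run_notebook, notebooks)
    for path, error in results:
        print(f"{path}: {'OK' if error is None else 'FAILED: ' + error}")
```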
2 changes: 1 addition & 1 deletion README.md
@@ -207,7 +207,7 @@ Example({
})
```

For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/takelab/podium/blob/master/docs/source/notebooks/quickstart.ipynb)
For a more interactive introduction, check out the quickstart on Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/TakeLab/podium/blob/master/docs/source/notebooks/quickstart.ipynb)

More complex examples can be found in our [examples folder](./examples).

9 changes: 6 additions & 3 deletions docs/source/advanced.rst
@@ -379,7 +379,9 @@ For a simple example, we will take a look at the built-in SST and IMDB datasets:
>>> from podium import Field, LabelField, Vocab
>>> # Load the datasets
>>> imdb_train, imdb_test = IMDB.get_dataset_splits()
>>> imdb_train.finalize_fields()
>>> sst_train, sst_dev, sst_test = SST.get_dataset_splits()
>>> sst_train.finalize_fields()
>>>
>>> # Luckily, both label vocabularies are already equal
>>> print(imdb_train.field('label').vocab.itos)
@@ -416,21 +418,22 @@ For this reason, usage of :class:`podium.datasets.BucketIterator` is recommended
>>> fields = {'text': text, 'label': label}
>>>
>>> sst_train, sst_valid, sst_test = SST.get_dataset_splits(fields=fields)
>>> sst_train.finalize_fields()
>>>
>>> # Define the iterators and our sort key
>>> from podium import Iterator, BucketIterator
>>> def instance_length(instance):
>>>     # Use the text Field
>>>     raw, tokenized = instance.text
>>>     return len(tokenized)
>>> bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)
>>> bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)

The ``bucket_sort_key`` function defines how the instances in the dataset should be sorted. It accepts a single dataset instance and returns a value that the ``BucketIterator`` uses as a sort key. It is interesting (and perhaps surprising) to see how much space (and time) bucketing saves. We will define a naive iterator over the same dataset and measure the total amount of padding used when iterating over it.

.. code-block:: python

>>> import numpy as np
>>> vanilla_iter = Iterator(train, batch_size=32)
>>> vanilla_iter = Iterator(sst_train, batch_size=32)
>>>
>>> def count_padding(batch, padding_idx):
>>>     return np.count_nonzero(batch == padding_idx)
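The measurement loop itself is collapsed in this diff view. A sketch of how the comparison might proceed, assuming batches expose the numericalized text as `batch.text` and that the PAD token sits at index 0 of the vocabulary (both are assumptions here, not Podium's documented API):

```python
# Sketch of the collapsed comparison; `batch.text` and padding_idx=0
# are assumptions about Podium's batch layout and default Vocab.
import numpy as np

def total_padding(iterator, padding_idx=0):
    # Sum padding entries over one full pass through the iterator.
    return sum(
        count_padding(np.asarray(batch.text), padding_idx)
        for batch in iterator
    )

print('Naive padding:', total_padding(vanilla_iter))
print('Bucketed padding:', total_padding(bucket_iter))
```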
@@ -518,7 +521,7 @@ Each ``Dataset`` instance in the SST dataset splits contains ``Field``\s and a `
>>> import pickle
>>>
>>> cache_dir = Path('cache')
>>> cache_dir.mkdir()
>>> cache_dir.mkdir(exist_ok=True)
>>>
>>> dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')
>>>
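The save/load logic that follows is collapsed in this view; a minimal sketch, assuming the preprocessed dataset object (and any hooks attached to its Fields) pickles cleanly — `sst_train` is an assumed name standing in for the dataset being cached:

```python
# Sketch of the collapsed cache logic: pickle the preprocessed dataset
# on the first run, reuse it afterwards.
if dataset_store_path.exists():
    with open(dataset_store_path, 'rb') as f:
        sst_train = pickle.load(f)
else:
    with open(dataset_store_path, 'wb') as f:
        pickle.dump(sst_train, f)
```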
2 changes: 1 addition & 1 deletion docs/source/installation.md
@@ -23,5 +23,5 @@ Coming soon!
## Installing from source

To install from source via terminal:
1. Clone the repository: `git clone [email protected]:takelab/podium.git && cd podium`
1. Clone the repository: `git clone [email protected]:TakeLab/podium.git && cd podium`
2. Install podium: `pip install .`
21 changes: 12 additions & 9 deletions docs/source/notebooks/advanced.ipynb
@@ -9,7 +9,7 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
"# ! pip install git+https://github.com/takelab/podium\n",
"# ! pip install git+https://github.com/TakeLab/podium.git\n",
"\n",
"# Additional dependencies required to run this notebook:\n",
"! pip install transformers spacy\n",
@@ -134,7 +134,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='interact_fields'></a>"
"<a name='interact_fields' id='interact_fields'></a>"
]
},
{
@@ -337,7 +337,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='specials'></a>"
"<a name='specials' id='specials'></a>"
]
},
{
@@ -477,7 +477,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='custom_numericalization'></a>"
"<a name='custom_numericalization' id='custom_numericalization'></a>"
]
},
{
@@ -798,7 +798,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='dataset_concat'></a>"
"<a name='dataset_concat' id='dataset_concat'></a>"
]
},
{
@@ -843,7 +843,9 @@
"from podium import Field, LabelField, Vocab\n",
"# Load the datasets\n",
"imdb_train, imdb_test = IMDB.get_dataset_splits()\n",
"imdb_train.finalize_fields()\n",
"sst_train, sst_dev, sst_test = SST.get_dataset_splits()\n",
"sst_train.finalize_fields()\n",
"\n",
"# Luckily, both label vocabularies are already equal\n",
"print(imdb_train.field('label').vocab.itos)\n",
@@ -867,7 +869,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='bucketing'></a>"
"<a name='bucketing' id='bucketing'></a>"
]
},
{
@@ -900,14 +902,15 @@
"fields = {'text': text, 'label': label}\n",
"\n",
"train, valid, test = SST.get_dataset_splits(fields=fields)\n",
"train.finalize_fields()\n",
"\n",
"# Define the iterators and our sort key\n",
"from podium import Iterator, BucketIterator\n",
"def instance_length(instance):\n",
" # Use the text Field\n",
" raw, tokenized = instance.text\n",
" return len(tokenized)\n",
"bucket_iter = BucketIterator(train, batch_size=32, bucket_sort_key=instance_length)"
"bucket_iter = BucketIterator(sst_train, batch_size=32, bucket_sort_key=instance_length)"
]
},
{
@@ -936,7 +939,7 @@
],
"source": [
"import numpy as np\n",
"vanilla_iter = Iterator(train, batch_size=32)\n",
"vanilla_iter = Iterator(sst_train, batch_size=32)\n",
"\n",
"def count_padding(batch, padding_idx):\n",
" return np.count_nonzero(batch == padding_idx)\n",
@@ -1092,7 +1095,7 @@
"import pickle\n",
"\n",
"cache_dir = Path('cache')\n",
"cache_dir.mkdir()\n",
"cache_dir.mkdir(exist_ok=True)\n",
"\n",
"dataset_store_path = cache_dir.joinpath('sst_preprocessed.pkl')\n",
"\n",
9 changes: 5 additions & 4 deletions docs/source/notebooks/preprocessing.ipynb
@@ -9,11 +9,12 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
"# ! pip install git+https://github.com/takelab/podium\n",
"# ! pip install git+https://github.com/TakeLab/podium.git\n",
"\n",
"# Additional dependencies required to run this notebook:\n",
"! pip install sacremoses clean-text spacy truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n",
"! python -m spacy download en_core_web_sm"
"! pip install sacremoses clean-text spacy spacy-lookups-data truecase https://github.com/LIAAD/yake/archive/v0.4.2.tar.gz\n",
"! python -m spacy download en_core_web_sm\n",
"! python -m nltk.downloader stopwords"
]
},
{
@@ -387,7 +388,7 @@
{
"data": {
"text/plain": [
"(None, [opinion', 'exciting', 'funny', 'movie'])"
"(None, ['opinion', 'exciting', 'funny', 'movie'])"
]
},
"execution_count": null,
8 changes: 4 additions & 4 deletions docs/source/notebooks/quickstart.ipynb
@@ -9,7 +9,7 @@
"# Podium installation\n",
"! pip install podium-nlp\n",
"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
"# ! pip install git+https://github.com/takelab/podium"
"# ! pip install git+https://github.com/TakeLab/podium.git"
]
},
{
@@ -131,9 +131,9 @@
"data": {
"text/plain": [
"Example({\n",
" input_text: (None, ['Amazingly', 'lame', '.']),\n",
" input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n",
" target: (None, 'negative')\n",
" input_text: (None, ['Amazingly', 'lame', '.']),\n",
" input_chars: (None, ['A', 'm', 'a', 'z', 'i', 'n', 'g', 'l', 'y', ' ', 'l', 'a', 'm', 'e', ' ', '.']),\n",
" target: (None, 'negative')\n",
"})"
]
},
3 changes: 3 additions & 0 deletions docs/source/notebooks/sample_dataset.csv
@@ -0,0 +1,3 @@
text,label
Absorbing character study .,positive
Amazingly lame .,negative
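The new `sample_dataset.csv` presumably backs the loading examples in the notebooks. A hedged sketch of reading it with Podium — the `TabularDataset` signature is assumed from the library's torchtext-style API, and the field names mirror the CSV header:

```python
# Sketch only; TabularDataset's exact signature is an assumption.
from podium import Field, LabelField, Vocab
from podium.datasets import TabularDataset

text = Field(name='text', numericalizer=Vocab())
label = LabelField(name='label')
fields = {'text': text, 'label': label}

dataset = TabularDataset('sample_dataset.csv', format='csv', fields=fields)
dataset.finalize_fields()  # build the vocab once all examples are loaded
print(dataset[0])
```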