
Auto download & Support for KMNIST and Fashion-MNIST #3

Open
wants to merge 12 commits into base: master
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.idea
__pycache__/
data/
scratch.py
53 changes: 26 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,53 +1,52 @@

# MNIST for Numpy
# Datasets for Numpy

![](mnist_image.png)

The MNIST database of handwritten digits has 60,000 training examples, and 10,000 test examples.
Each example in the MNIST database is a 28x28 grayscale image of a handwritten digit together with its corresponding label (0-9).
This Python module makes it easy to load the MNIST database into numpy arrays.
For more details about the MNIST database, please visit [here](http://yann.lecun.com/exdb/mnist/index.html).

## Requirements

- Python 3.x
- Numpy

## Datasets

- [MNIST](http://yann.lecun.com/exdb/mnist/index.html)
- [KMNIST](https://github.com/rois-codh/kmnist)
- [Fashion-MNIST](https://github.com/zalandoresearch/fashion-mnist)
- [Banknote Authentication](https://archive.ics.uci.edu/ml/datasets/banknote+authentication)
- [Sonar (Mines vs. Rocks)](http://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks))
- [MHEALTH](https://archive.ics.uci.edu/ml/datasets/MHEALTH+Dataset)

## Usage

First, download `loader.py` from this repository and place it in your working directory.
You can then load dataset splits into numpy arrays as follows:

```python
import mnist
from loader import load_dataset

mnist.init()
x_train, y_train, x_test, y_test, classes = load_dataset('MNIST') # either MNIST, Fashion-MNIST, KMNIST, Banknote, Sonar or MHEALTH
```
`classes` is a tuple of strings containing the class names.

The module checks whether the relevant .pkl file is already available under ./data; otherwise, the dataset is downloaded and processed into a .pkl file.

Labels are integer-encoded by default. One-hot encoded labels can be retrieved like so:
```python
from loader import load_dataset

x_train, y_train, x_test, y_test, classes = load_dataset('KMNIST', one_hot=True)
```

You can change the proportion of samples allocated to the training set by passing the `train_prop` argument when loading a dataset for the first time (this does not apply to MNIST, KMNIST and Fashion-MNIST, which have predefined test sets). The default is 80%.

```python
from loader import load_dataset

x_train, y_train, x_test, y_test, classes = load_dataset('Sonar', train_prop=0.7)
```

## Notice
Thanks to hsjeong5 for his work on MNIST-for-Numpy!

Empty file added __init__.py
Empty file.
209 changes: 209 additions & 0 deletions loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
"""Dataset loader.

Currently supports MNIST, Fashion-MNIST, KMNIST, Banknote, Sonar and MHEALTH
datasets.

Forked from hsjeong5's MNIST-for-Numpy, which provided the original MNIST download code.
"""
import numpy as np
from urllib import request
import gzip
import pickle
import os
import zipfile
import shutil

SUPPORTED = ['MNIST', 'KMNIST', 'Fashion-MNIST',
'Banknote', 'Sonar', 'MHEALTH']

# Split names, will be used as dictionary keys
SPLITS = ('train_x', 'test_x', 'train_y', 'test_y')

# Files to fetch for MNIST and MNIST-like datasets
MNIST_URL = [
["training_images", "train-images-idx3-ubyte.gz"],
["test_images", "t10k-images-idx3-ubyte.gz"],
["training_labels", "train-labels-idx1-ubyte.gz"],
["test_labels", "t10k-labels-idx1-ubyte.gz"]
]

# Links are up-to-date as of August 27th, 2019
URL = {
'MNIST': 'http://yann.lecun.com/exdb/mnist/',
'Fashion-MNIST': 'http://fashion-mnist.s3-website.eu-central-1.amazonaws'
'.com/',
'KMNIST': 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/',
'Banknote': 'http://archive.ics.uci.edu/ml/machine-learning-databases'
'/00267/data_banknote_authentication.txt',
'Sonar': 'http://archive.ics.uci.edu/ml/machine-learning-databases'
'/undocumented/connectionist-bench/sonar/sonar.all-data',
'MHEALTH': 'https://archive.ics.uci.edu/ml/machine-learning-databases'
'/00319/MHEALTHDATASET.zip'
}

# Class names. Order follows integer encoding in datasets
CLASSES = {
'MNIST': ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"),
'Fashion-MNIST': ("T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
"Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"),
'KMNIST': ('お', 'き', 'す', 'つ', 'な', 'は', 'ま', 'や', 'れ', 'を'),
'Banknote': ('Genuine', 'Forged'),
'Sonar': ('Mine', 'Rock'),
'MHEALTH': (
'null',
'Standing still (1 min)',
'Sitting and relaxing (1 min)',
'Lying down (1 min)',
'Walking (1 min)',
'Climbing stairs (1 min)',
'Waist bends forward (20x)',
'Frontal elevation of arms (20x)',
'Knees bending (crouching) (20x)',
'Cycling (1 min)',
'Jogging (1 min)',
'Running (1 min)',
'Jump front & back (20x)'
)
}


def download_mnist(dataset_name):
"""Download MNIST and MNIST-like datasets."""
base_url = URL[dataset_name]
print("Downloading {}...".format(dataset_name))
for name in MNIST_URL:
request.urlretrieve(base_url + name[1], name[1])
print("Download complete.")


def save_mnist(dataset_name):
"""Save dataset as a .pkl file and remove .gz files."""
data = []
for name in MNIST_URL[:2]:
with gzip.open(name[1], 'rb') as f:
data.append(np.frombuffer(f.read(), np.uint8,
offset=16).reshape(-1, 28 * 28))
for name in MNIST_URL[-2:]:
with gzip.open(name[1], 'rb') as f:
data.append(np.frombuffer(f.read(), np.uint8, offset=8))

if not os.path.exists('./data'):
os.mkdir('./data')

mnist = dict(zip(SPLITS, data))

with open("data/{}.pkl".format(dataset_name), 'wb') as f:
pickle.dump(mnist, f)

for name in MNIST_URL:
os.remove(name[1])

print("Save complete.")
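For context on the `offset=16` / `offset=8` values above: IDX image files start with a 16-byte header and IDX label files with an 8-byte header, which `np.frombuffer` skips. A minimal sketch on synthetic bytes (not a real MNIST file):

```python
import numpy as np

# Fake IDX-style image payload: a 16-byte header followed by one 28x28 image.
raw = bytes(16) + bytes(784)
images = np.frombuffer(raw, np.uint8, offset=16).reshape(-1, 28 * 28)
print(images.shape)  # one flattened 784-pixel image
```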


def init_mnist(dataset_name):
download_mnist(dataset_name)
save_mnist(dataset_name)


def init(dataset_name, train_prop):
if not os.path.exists('./data'):
os.mkdir('./data')

print("Downloading {}...".format(dataset_name))
data = request.urlopen(URL[dataset_name])

    if dataset_name == 'Sonar':
html_response = data.read()
encoding = data.headers.get_content_charset('utf-8')
data = html_response.decode(encoding)
data = data.replace('M', '0')
data = data.replace('R', '1')

with open('temp.txt', 'w') as temp:
temp.write(data)

data = np.genfromtxt('temp.txt', delimiter=',')
os.remove('temp.txt')
    elif dataset_name == 'MHEALTH':
        # Retrieve and extract the archive
request.urlretrieve(URL[dataset_name], "temp.zip")
with zipfile.ZipFile('temp.zip', 'r') as zip_ref:
zip_ref.extractall()

# Load data for all patients
patients = [np.genfromtxt('MHEALTHDATASET/mHealth_subject{}.log'
.format(i),
delimiter=' ') for i in range(1, 11)]
data = np.concatenate(patients)

# Remove temp files
os.remove('temp.zip')
shutil.rmtree('MHEALTHDATASET')
else:
data = np.genfromtxt(data, delimiter=',')

np.random.shuffle(data)

x, y = data[:, :-1], data[:, -1]
split = int(train_prop * x.shape[0])

train_x, train_y = x[0:split], y[0:split]
    test_x, test_y = x[split:], y[split:]  # x[split:-1] would drop the last sample
train_y = train_y.astype(int)
test_y = test_y.astype(int)

dataset = dict(zip(SPLITS, (train_x, test_x, train_y, test_y)))

with open("data/{}.pkl".format(dataset_name), 'wb') as f:
pickle.dump(dataset, f)

print("Save complete.")


def vectorize(labels, dataset_name):
"""Take an integer encoded array and return a one-hot encoded version."""
temp = np.zeros((len(labels), len(CLASSES[dataset_name])))
temp[np.arange(len(labels)), labels] = 1
return temp
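As a quick sanity check of the one-hot trick above (a standalone sketch: the class count is passed directly instead of being looked up in `CLASSES`):

```python
import numpy as np

def one_hot(labels, n_classes):
    # Same indexing trick as vectorize(): rows by position, columns by label.
    temp = np.zeros((len(labels), n_classes))
    temp[np.arange(len(labels)), labels] = 1
    return temp

print(one_hot(np.array([0, 2, 1]), 3))
```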


def load_dataset(dataset_name='MNIST', train_prop=None, one_hot=False):
"""Load dataset as numpy arrays.

Download dataset if not already available under ./data.
"""
if dataset_name not in SUPPORTED:
        raise ValueError('{} is not supported.'.format(dataset_name))

if not os.path.exists('data/{}.pkl'.format(dataset_name)):
if dataset_name in ['MNIST', 'KMNIST', 'Fashion-MNIST']:
if train_prop is not None:
print('Warning! MNIST datasets ignore '
'the train_prop argument.')
init_mnist(dataset_name)

elif dataset_name in ['Banknote', 'Sonar', 'MHEALTH']:
if train_prop is None:
train_prop = 0.8
init(dataset_name, train_prop)

elif train_prop is not None:
        raise ValueError('train_prop should only be used when initializing a '
                         'dataset. Please delete the .pkl file in the data '
                         'directory and try again.')

# Load data
with open("data/{}.pkl".format(dataset_name), 'rb') as f:
dataset = pickle.load(f)

if one_hot:
dataset['train_y'] = vectorize(dataset['train_y'], dataset_name)
dataset['test_y'] = vectorize(dataset['test_y'], dataset_name)

return dataset["train_x"], dataset["train_y"], dataset[
"test_x"], dataset["test_y"], CLASSES[dataset_name]
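For reference, the .pkl files produced above are plain pickled dicts keyed by `SPLITS`; a round-trip sketch with toy arrays (hypothetical data, written to a temporary directory rather than ./data):

```python
import os
import pickle
import tempfile

import numpy as np

SPLITS = ('train_x', 'test_x', 'train_y', 'test_y')
arrays = (np.zeros((4, 784)), np.zeros((2, 784)),
          np.array([0, 1, 2, 3]), np.array([4, 5]))
dataset = dict(zip(SPLITS, arrays))

# Dump and reload the same way init() and load_dataset() do.
path = os.path.join(tempfile.mkdtemp(), 'Toy.pkl')
with open(path, 'wb') as f:
    pickle.dump(dataset, f)
with open(path, 'rb') as f:
    loaded = pickle.load(f)

print(sorted(loaded))
print(loaded['train_y'].tolist())
```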


if __name__ == '__main__':
init_mnist('MNIST')
42 changes: 0 additions & 42 deletions mnist.py

This file was deleted.