import os
import urllib.request

import h5py
import numpy as np
import torch
def parse_binary_mnist(data_dir):
    """Parse the binarized-MNIST ``.amat`` text files found in *data_dir*.

    Each ``.amat`` file stores one image per line as whitespace-separated
    0/1 pixel values.

    Args:
        data_dir: Directory containing ``binarized_mnist_train.amat``,
            ``binarized_mnist_valid.amat`` and ``binarized_mnist_test.amat``.

    Returns:
        Tuple ``(train_data, validation_data, test_data)`` of ``float32``
        numpy arrays, one row per image.
    """

    def _load_split(split):
        # One image per line; each token is a 0/1 pixel value.
        path = os.path.join(data_dir, "binarized_mnist_{}.amat".format(split))
        with open(path) as f:
            rows = [[int(tok) for tok in line.split()] for line in f]
        return np.array(rows).astype("float32")

    return _load_split("train"), _load_split("valid"), _load_split("test")
23 | 25 |
|
24 | 26 |
|
def download_binary_mnist(fname):
    """Download the binarized-MNIST splits and save them to an HDF5 file.

    Fetches the train/valid/test ``.amat`` files from Hugo Larochelle's
    public server into ``/tmp``, parses them with ``parse_binary_mnist``,
    and writes the three arrays to *fname* under the HDF5 keys
    ``"train"``, ``"valid"`` and ``"test"``.

    Args:
        fname: Destination path for the HDF5 file.
    """
    data_dir = "/tmp/"
    for subdataset in ["train", "valid", "test"]:
        filename = "binarized_mnist_{}.amat".format(subdataset)
        url = (
            "http://www.cs.toronto.edu/~larocheh/public/datasets/"
            "binarized_mnist/binarized_mnist_{}.amat".format(subdataset)
        )
        local_filename = os.path.join(data_dir, filename)
        # NOTE: requires `import urllib.request` at module level.
        urllib.request.urlretrieve(url, local_filename)

    train, validation, test = parse_binary_mnist(data_dir)

    # Context manager guarantees the HDF5 file is closed even if a
    # create_dataset call raises (the original used a bare f.close()).
    with h5py.File(fname, "w") as f:
        f.create_dataset("train", data=train)
        f.create_dataset("valid", data=validation)
        f.create_dataset("test", data=test)
    print(f"Saved binary MNIST data to: {fname}")
| 47 | + |
| 48 | + |
def load_binary_mnist(fname, batch_size, test_batch_size, use_gpu):
    """Build PyTorch DataLoaders for a binarized-MNIST HDF5 file.

    Args:
        fname: Path to an HDF5 file with ``"train"``/``"valid"``/``"test"``
            datasets (as written by ``download_binary_mnist``).
        batch_size: Batch size for the shuffled training loader.
        test_batch_size: Batch size for the validation and test loaders.
        use_gpu: If true, use 4 worker processes and pinned memory.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # Read all three splits into memory, then close the HDF5 handle
    # (the original left the file open for the process lifetime).
    with h5py.File(fname, "r") as f:
        x_train = f["train"][::]
        x_val = f["valid"][::]
        x_test = f["test"][::]

    kwargs = {"num_workers": 4, "pin_memory": True} if use_gpu else {}

    def _loader(array, size, shuffle):
        # Each dataset yields 1-tuples containing an image tensor.
        dataset = torch.utils.data.TensorDataset(torch.from_numpy(array))
        return torch.utils.data.DataLoader(
            dataset, batch_size=size, shuffle=shuffle, **kwargs
        )

    train_loader = _loader(x_train, batch_size, True)
    val_loader = _loader(x_val, test_batch_size, False)
    test_loader = _loader(x_test, test_batch_size, False)
    return train_loader, val_loader, test_loader
0 commit comments