-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy path1-5-datasets.py
116 lines (94 loc) · 2.94 KB
/
1-5-datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#%%
# To do machine learning you need data, and there are three concepts
# to master here: Dataset, DataLoader, and transforms
#%%
# Let's make use of pandas and CSV data to create a dataset.
import torch
import pandas
from torch.utils.data import Dataset
class IrisDataset(Dataset):
    '''Dataset backed by a CSV file loaded with pandas.

    Each sample is one row of the CSV.
    '''

    def __init__(self, csv_path='./Iris.csv'):
        '''Load up the data.

        Keyword Arguments:
            csv_path {str} -- CSV file to read. Defaults to './Iris.csv',
                the path that used to be hard-coded here, so a plain
                IrisDataset() call behaves exactly as before.
        '''
        self.data = pandas.read_csv(csv_path)

    def __len__(self):
        '''How much data do we have?

        Returns:
            int -- number of rows in the loaded CSV.
        '''
        return len(self.data)

    def __getitem__(self, idx):
        '''Grab one data sample

        Arguments:
            idx {int} -- data at this position.

        Returns:
            pandas.Series -- for an integer idx, the row at that
                position, keyed by the CSV's column names.
        '''
        return self.data.iloc[idx]
# pretty simple when we start from pandas
# here is a dataset loaded, with a single sample
# The last line builds a tuple of (number of samples, first sample).
# Run as an interactive cell that tuple is the cell's displayed value;
# run as a plain script it is computed and discarded.
iris = IrisDataset()
len(iris), iris[0]
#%%
# To do machine learning you need data, and there are three concepts
# to master here: Dataset, DataLoader, and transforms
#%%
# Let's make use of pandas and CSV data to create a dataset.
import torch
import pandas
from torch.utils.data import Dataset
class IrisDataset(Dataset):
    '''Dataset backed by a CSV file loaded with pandas.

    Each sample is one row of the CSV.
    '''

    def __init__(self, csv_path='./Iris.csv'):
        '''Load up the data.

        Keyword Arguments:
            csv_path {str} -- CSV file to read. Defaults to './Iris.csv',
                the path that used to be hard-coded here, so a plain
                IrisDataset() call behaves exactly as before.
        '''
        self.data = pandas.read_csv(csv_path)

    def __len__(self):
        '''How much data do we have?

        Returns:
            int -- number of rows in the loaded CSV.
        '''
        return len(self.data)

    def __getitem__(self, idx):
        '''Grab one data sample

        Arguments:
            idx {int} -- data at this position.

        Returns:
            pandas.Series -- for an integer idx, the row at that
                position, keyed by the CSV's column names.
        '''
        return self.data.iloc[idx]
# pretty simple when we start from pandas
# here is a dataset loaded, with a single sample
# The last line builds a tuple of (number of samples, first sample).
# Run as an interactive cell that tuple is the cell's displayed value;
# run as a plain script it is computed and discarded.
iris = IrisDataset()
len(iris), iris[0]
#%%
# Now, the small problem is -- each sample is a pandas Series,
# and we're going to need a tensor for inputs and
# the target label -- so we need to transform
class TensorIrisDataset(IrisDataset):
    '''IrisDataset whose samples are converted into a feature tensor
    plus a label.
    '''

    # CSV columns packed, in this order, into each sample's tensor
    FEATURE_COLUMNS = (
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    )

    def __getitem__(self, idx):
        '''Get a single sample that is
        {tensor:, label:}

        Arguments:
            idx {int} -- data at this position.

        Returns:
            dict -- 'tensor' holds the FEATURE_COLUMNS values as a
                float32 tensor; 'label' holds the row's Species value
                exactly as it was read from the CSV.
        '''
        sample = super().__getitem__(idx)
        return {
            # torch.tensor with an explicit dtype yields the same float32
            # result the legacy torch.Tensor(list) constructor produced
            'tensor': torch.tensor(
                [sample[column] for column in self.FEATURE_COLUMNS],
                dtype=torch.float32,
            ),
            'label': sample.Species,
        }
# and output...
# The last line builds a tuple of (number of samples, first sample).
# Run as an interactive cell that tuple is the cell's displayed value;
# run as a plain script it is computed and discarded.
tensors = TensorIrisDataset()
len(tensors), tensors[0]
#%%
# Training almost always takes place in batches
# so pytorch has a very convenient loader that can take
# a dataset and turn it into batches so you can iterate
from torch.utils.data import DataLoader
# batches of up to 16 samples, drawn in a reshuffled order on each pass
loader = DataLoader(dataset=tensors, shuffle=True, batch_size=16)
for batch in loader:
    print(batch)
# see how the data comes out in batches; the last batch holds
# whatever samples are left over, so it may be smaller than 16
#%%
# And -- there is even a parallel possibility
# this is a pretty small dataset so it's not really
# essential, but here is how you use it
# Worker processes can be started by re-importing this file (the "spawn"
# start method, the default on Windows and macOS). Without the __main__
# guard each worker would re-run this loop and try to start workers of
# its own, so the multi-worker iteration only happens in the main process.
if __name__ == '__main__':
    parallel_loader = DataLoader(tensors,
                                 batch_size=16, shuffle=True, num_workers=4)
    for batch in parallel_loader:
        print(batch)