datasets.py
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 3 14:38:04 2018

@author: sakurai
"""
import gzip
from pathlib import Path
import urllib.request

import h5py
import numpy as np
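
# The UCI "Bag of Words" docword files begin with three header lines (number
# of documents, vocabulary size, number of nonzero counts), followed by one
# "docID wordID count" triple per line; both document and word ids are
# 1-origin in the text files.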


def _download_uci_corpus(dataset_name, dataset_location='~/dataset/bow'):
    url_base = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                'bag-of-words/')
    bow_url = url_base + 'docword.{}.txt.gz'.format(dataset_name)
    vocab_url = url_base + 'vocab.{}.txt'.format(dataset_name)
    dataset_location = Path(dataset_location).expanduser()
    bow_path = dataset_location / 'docword.{}.txt.gz'.format(dataset_name)
    vocab_path = dataset_location / 'vocab.{}.txt'.format(dataset_name)
    dataset_location.mkdir(parents=True, exist_ok=True)
    if not bow_path.exists():
        urllib.request.urlretrieve(bow_url, bow_path)
    if not vocab_path.exists():
        urllib.request.urlretrieve(vocab_url, vocab_path)


def _create_hdf5(dataset_name, dataset_location='~/dataset/bow'):
    if dataset_name not in ('kos', 'enron', 'nips', 'nytimes', 'pubmed'):
        raise ValueError("`dataset_name` must be 'kos', 'enron', 'nips',"
                         " 'nytimes' or 'pubmed'")
    dataset_location = Path(dataset_location).expanduser()
    docwords_filepath = dataset_location / 'docword.{}.txt.gz'.format(
        dataset_name)
    vocab_filepath = dataset_location / 'vocab.{}.txt'.format(dataset_name)
    hdf5_filepath = dataset_location / '{}.hdf5'.format(dataset_name)
    if hdf5_filepath.exists():
        return
    print('Creating', hdf5_filepath)
    f = h5py.File(hdf5_filepath, 'w')

    # Create bag-of-words (pairs of an array of words and an array of counts)
    docwords_file = gzip.open(docwords_filepath)
    n_documents = int(docwords_file.readline())
    docwords_file.readline()  # Read and discard a line (n_words)
    docwords_file.readline()  # Read and discard a line (n_nonzero)
    dt = h5py.special_dtype(vlen=np.int32)
    ds = f.create_dataset('bow', (n_documents, 2), dtype=dt)
    current_document = 0
    words = []
    counts = []
    for line in docwords_file:
        d, w, n = [int(num) for num in line.decode('utf-8').split()]
        if d - 1 != current_document:
            # A new document starts: flush the accumulated one.
            words = np.array(words, np.int32) - 1  # Make word ids 0-origin
            counts = np.array(counts, np.int32)
            ds[current_document] = np.vstack((words, counts))
            current_document = d - 1
            words = []
            counts = []
        words.append(w)
        counts.append(n)
    words = np.array(words, np.int32) - 1  # Make word ids 0-origin
    counts = np.array(counts, np.int32)
    ds[current_document] = np.vstack((words, counts))
    docwords_file.close()

    # Create vocabulary
    with open(vocab_filepath) as vocab_file:
        words = [word.strip() for word in vocab_file.readlines()]
    n_vocab = len(words)
    dt = h5py.special_dtype(vlen=str)
    ds = f.create_dataset('vocab', (n_vocab,), dtype=dt)
    ds[:] = words
    f.flush()
    f.close()
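

# Layout of the HDF5 file written by _create_hdf5:
#   bow    -- shape (n_documents, 2), vlen int32; row i holds two ragged
#             arrays: the 0-origin word ids and matching counts of document i.
#   vocab  -- shape (n_vocab,), vlen str; vocab[j] is the word with id j.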


class _DatasetBase(object):
    dataset_name = None

    def __init__(self, dataset_location='~/dataset/bow'):
        if self.dataset_name is None:
            raise NotImplementedError
        dataset_name = self.dataset_name
        _download_uci_corpus(dataset_name, dataset_location)
        _create_hdf5(dataset_name, dataset_location)
        hdf5_filepath = Path(dataset_location).expanduser() / '{}.hdf5'.format(
            dataset_name)
        self._hdf5 = h5py.File(hdf5_filepath, 'r')
        self.docs = self._hdf5['bow']
        self.vocabulary = np.array(self._hdf5['vocab'])
        self.num_docs = len(self.docs)
        self.num_terms = len(self.vocabulary)
        self.id2word = self.vocabulary

    def __del__(self):
        if hasattr(self, '_hdf5'):  # __init__ may have failed before opening
            self._hdf5.close()
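

# A minimal helper sketch (not part of the original class API; the name
# `bow_to_pairs` is hypothetical): convert one stored document into a list of
# (word_id, count) tuples, the sparse format used by e.g. gensim topic models.
def bow_to_pairs(dataset, doc_id):
    word_ids, counts = dataset.docs[doc_id]
    return list(zip(word_ids.tolist(), counts.tolist()))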


class KosDataset(_DatasetBase):
    dataset_name = 'kos'


class NipsDataset(_DatasetBase):
    dataset_name = 'nips'


class EnronDataset(_DatasetBase):
    dataset_name = 'enron'


class NytimesDataset(_DatasetBase):
    dataset_name = 'nytimes'


class PubmedDataset(_DatasetBase):
    dataset_name = 'pubmed'


if __name__ == '__main__':
    dataset_location = r'E:\Dataset\bow'
    dataset = KosDataset(dataset_location)
    dataset = NipsDataset(dataset_location)
    dataset = EnronDataset(dataset_location)
    dataset = NytimesDataset(dataset_location)
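
    # Hypothetical usage (not in the original script): peek at the first
    # document of the kos corpus and map its word ids back to words.
    kos = KosDataset(dataset_location)
    word_ids, counts = kos.docs[0]
    for word_id, count in list(zip(word_ids, counts))[:5]:
        print(kos.id2word[word_id], count)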