Skip to content

Commit 47e6937

Browse files
committed
Initial commit
1 parent e473b48 commit 47e6937

File tree

5 files changed

+634
-0
lines changed

5 files changed

+634
-0
lines changed

prep_corpus.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""
2+
Preprocess dataset for sent140 corpus.
3+
4+
"""
5+
import csv
6+
import numpy as np
7+
import cPickle as pickle
8+
import re
9+
import sys
10+
11+
from utils import alph, translate, ALPH_rev
12+
13+
FILENAME = './sent140.csv'
14+
LENGTH = 140
15+
16+
def normalize(string):
    '''Return `string` aligned to exactly LENGTH characters.

    Input shorter than LENGTH is right-padded with spaces; input longer
    than LENGTH is cut, so the result always has the same width and
    callers do not have to slice the text themselves.

    NOTE(review): symbols outside the alphabet are NOT filtered here.
    Whether `translate` copes with them cannot be seen from this file --
    confirm against utils.
    '''
    return string[:LENGTH].ljust(LENGTH)
20+
21+
def get_dataset(filename, limit=0):
    '''Read the sent140 CSV at `filename` and return the pair `(X, Y)`.

    For every row, column 0 holds the label and column 5 the text.

    X -- list with one entry per row: the text cut/padded to LENGTH
         characters by `normalize` and then passed through `translate`
         (presumably a sequence of alphabet indices -- confirm in utils).
    Y -- list of binary labels: 0 when the CSV label is 0, otherwise 1.

    limit -- maximum number of rows to read; 0 (the default) means
             "read the whole file".

    Raises ValueError if a label is not an integer and IndexError if a
    row has fewer than six columns.
    '''
    X, Y = [], []
    with open(filename) as h:
        for i, row in enumerate(csv.reader(h)):
            # Check the cap *before* consuming the row so that exactly
            # `limit` rows are returned (checking after the append with
            # `i > limit` lets limit + 2 rows through).
            if limit != 0 and i >= limit:
                break
            Y.append(0 if int(row[0]) == 0 else 1)
            # Slice with LENGTH rather than a literal 140 so the width
            # used here cannot drift away from the one in `normalize`.
            X.append(translate(normalize(row[5][:LENGTH])))
    return X, Y
35+
36+
if __name__ == '__main__':
    # Command-line entry point: prep_corpus.py <input_corpus> <output>

    if len(sys.argv) < 3:
        # Parenthesised single-argument print: same output as the
        # Python 2 print statement, and also valid Python 3 syntax.
        print("Usage: %s <input_corpus> <output>" % (sys.argv[0]))
        sys.exit(1)

    inp = sys.argv[1]
    out = sys.argv[2]

    X, Y = get_dataset(filename=inp, limit=100000)

    # Three arrays are written back to back into a single file, in this
    # order: [alphabet size], X, Y. A reader has to call np.load on the
    # same handle three times, in the same order.
    with open(out, 'wb') as h:
        np.save(h, [len(alph)])
        # NOTE(review): int8 storage assumes every value produced by
        # `translate` fits into -128..127 -- confirm against utils.
        np.save(h, np.array(X, dtype=np.int8))
        np.save(h, np.array(Y, dtype=np.int8))
51+

prep_layer.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""
2+
One-hot encode a preprocessed sent140 corpus (as written by prep_corpus.py).
3+
4+
"""
5+
import numpy as np
6+
import sys
7+
8+
from utils import alph, translate, ALPH_rev
9+
10+
def encode(data, alph):
    '''One-hot encode a 2-D array of symbol indices.

    data -- integer array of shape (m, n): batch_size x n_steps.
    alph -- number of symbols in the alphabet. Note that inside this
            function the parameter shadows the `alph` imported from utils.

    Returns a uint8 array of shape (m, n * alph). Step j of a row owns
    the column slice [j * alph, (j + 1) * alph); within it the column
    j * alph + data[i, j] is set to 1 and everything else stays 0.

    NOTE(review): values of `data` outside 0..alph-1 are not checked;
    they land in a neighbouring step's slice or raise IndexError.
    '''
    m, n = data.shape  # batch_size x n_steps

    enc = np.zeros((m, n * alph), dtype=np.uint8)

    # One vectorised assignment instead of an m * n Python-level loop.
    # `rows` is a column vector and `cols` an (m, n) matrix, so the two
    # broadcast to one (row, column) pair per input symbol.
    rows = np.arange(m)[:, np.newaxis]
    cols = np.arange(n) * alph + data
    enc[rows, cols] = 1

    return enc
20+
21+
if __name__ == '__main__':
    # Command-line entry point: prep_layer.py <input_corpus> <output>

    if len(sys.argv) < 3:
        # Parenthesised single-argument print: same output as the
        # Python 2 print statement, and also valid Python 3 syntax.
        print("Usage: %s <input_corpus> <output>" % (sys.argv[0]))
        sys.exit(1)

    inp = sys.argv[1]
    out = sys.argv[2]

    # The input holds binary .npy records, so it must be opened in 'rb'
    # mode; text mode breaks np.load on Python 3 and can corrupt the
    # data on platforms that translate line endings.
    with open(inp, 'rb') as h:
        # Three arrays stored back to back: [alphabet size], X, Y.
        alph = np.load(h)[0]
        X = np.load(h)
        Y = np.load(h)  # labels; read for completeness, not used below

    # This can be the memory-critical part: the encoded matrix has
    # `alph` times as many columns as X.
    enc_X = encode(X, alph)

    with open(out, 'wb') as h:
        np.save(h, enc_X)  # indicates the layer
40+

0 commit comments

Comments
 (0)