File tree 5 files changed +634
-0
lines changed
5 files changed +634
-0
lines changed Original file line number Diff line number Diff line change
1
+ """
2
+ Preprocess dataset for sent140 corpus.
3
+
4
+ """
5
+ import csv
6
+ import numpy as np
7
+ import cPickle as pickle
8
+ import re
9
+ import sys
10
+
11
+ from utils import alph , translate , ALPH_rev
12
+
13
FILENAME = './sent140.csv'
LENGTH = 140


def normalize(string):
    """Return `string` as exactly LENGTH characters.

    Shorter input is right-padded with spaces; longer input is cut off after
    LENGTH characters, so every result has the same width.

    NOTE(review): no out-of-alphabet symbols are removed here -- presumably
    translate() deals with them; confirm against utils.
    """
    # Slice first so that over-long input cannot escape the fixed width;
    # ljust alone only pads and would return such input unchanged.
    return string[:LENGTH].ljust(LENGTH)
20
+
21
def get_dataset(filename, limit=0):
    """Read a CSV corpus and return translated texts with binary labels.

    Column 0 of each row is parsed as an integer label (0 stays 0, any other
    value becomes 1).  Column 5 is taken as the text: it is cut to LENGTH
    characters, padded by normalize() and passed through translate().

    filename -- path of the CSV file to read.
    limit    -- maximum number of rows to read; 0 (the default) reads all rows.

    Returns a tuple (X, Y) of two equally long lists: the translated texts
    and their labels.
    """
    X, Y = [], []
    with open(filename) as h:
        for i, row in enumerate(csv.reader(h)):
            # Test the cap before consuming the row so that at most `limit`
            # rows end up in the result.
            if limit != 0 and i >= limit:
                break
            Y.append(0 if int(row[0]) == 0 else 1)
            X.append(translate(normalize(row[5][:LENGTH])))
    return X, Y
35
+
36
if __name__ == '__main__':

    # Both the input corpus and the output path must be given.
    if len(sys.argv) < 3:
        print("Usage: %s <input_corpus> <output>" % (sys.argv[0]))
        sys.exit(1)

    inp, out = sys.argv[1], sys.argv[2]

    X, Y = get_dataset(filename=inp, limit=100000)

    # Three arrays are written back to back into a single file:
    # the alphabet size, the translated texts and the labels.
    with open(out, 'wb') as h:
        np.save(h, [len(alph)])
        np.save(h, np.array(X, dtype=np.int8))
        np.save(h, np.array(Y, dtype=np.int8))
51
+
Original file line number Diff line number Diff line change
1
+ """
2
+ One-hot encode a dataset produced by the sent140 preprocessing script.
3
+
4
+ """
5
+ import numpy as np
6
+ import sys
7
+
8
+ from utils import alph , translate , ALPH_rev
9
+
10
def encode(data, alph):
    """One-hot encode a 2-D array of symbol indices.

    data -- integer array of shape (m, n); each entry is used as an index
            into an alphabet of `alph` symbols.
    alph -- alphabet size, i.e. the number of columns reserved per position.

    Returns a uint8 array of shape (m, n * alph) in which, for every (i, j),
    column j * alph + data[i, j] of row i is 1 and all other entries are 0.
    """
    m, n = data.shape  # batch_size x n_steps
    alph = int(alph)

    enc = np.zeros((m, n * alph), dtype=np.uint8)

    # Widen the indices before adding the per-position offset: a narrow
    # dtype such as int8 would overflow once j * alph leaves its range.
    cols = np.arange(n, dtype=np.intp) * alph + np.asarray(data, dtype=np.intp)
    rows = np.arange(m, dtype=np.intp)[:, np.newaxis]

    # One vectorized assignment sets the single hot column of every (i, j).
    enc[rows, cols] = 1

    return enc
20
+
21
if __name__ == '__main__':

    # Both the input file and the output path must be given.
    if len(sys.argv) < 3:
        print("Usage: %s <input_corpus> <output>" % (sys.argv[0]))
        sys.exit(1)

    inp = sys.argv[1]
    out = sys.argv[2]

    # .npy data is binary: open with 'rb' so the stream is neither decoded
    # nor newline-translated while np.load reads from it.
    with open(inp, 'rb') as h:
        # Arrays are read back in the order they were stored in the file.
        # A separate name keeps the `alph` imported from utils intact.
        alph_size = np.load(h)[0]
        X = np.load(h)
        # NOTE(review): Y is loaded but never written to the output -- confirm intent.
        Y = np.load(h)

    # this could be memory-critical part
    enc_X = encode(X, alph_size)

    # Only the encoded matrix is written to the output file.
    with open(out, 'wb') as h:
        np.save(h, enc_X)
40
+
You can’t perform that action at this time.
0 commit comments