forked from levitation/toxic_comments
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
106 lines (87 loc) · 3.09 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
import sys
import json
import logging
import pandas as pd
from tqdm import tqdm
import numpy as np
import gensim
try:
import cPickle as pickle
except ImportError:
import pickle
def load_data(fname, **kwargs):
    """Read a CSV file with pandas and optionally post-process its values.

    Args:
        fname: Path or file-like object handed to ``pd.read_csv``.
        **kwargs: Forwarded verbatim to ``pd.read_csv``, except for the
            optional ``func`` entry described below.
        func: Optional callable (passed by keyword). When given, it is
            called with ``df.values`` and its result is returned instead
            of the DataFrame.

    Returns:
        The loaded ``DataFrame``, or ``func(df.values)`` when ``func`` is
        supplied.
    """
    # Always strip ``func`` from kwargs -- even an explicit ``func=None`` --
    # so it can never leak into ``pd.read_csv`` as an unknown keyword.
    func = kwargs.pop('func', None)
    df = pd.read_csv(fname, **kwargs)
    if func is None:
        return df
    return func(df.values)
class Embeds(object):
    """Word -> vector lookup loaded from a fastText/word2vec/JSON/pickle file.

    The vectors live in ``self.model``: a plain ``dict`` mapping each word to
    a list of floats (fastText text files, JSON and pickle dumps), or the
    object returned by gensim's ``KeyedVectors.load_word2vec_format`` for
    ``w2v_type='word2vec'``.
    """

    def __init__(self, fname, w2v_type='fasttext', format='file'):
        """Load embeddings from ``fname``.

        Args:
            fname: Path of the embeddings file.
            w2v_type: ``'fasttext'`` (text file parsed by this class) or
                ``'word2vec'`` (loaded through gensim). Any other value
                produces an empty model.
            format: ``'json'`` or ``'pickle'`` to restore a dump written by
                :meth:`save` (takes precedence over ``w2v_type``);
                ``'binary'`` selects the binary word2vec reader; anything
                else means a plain text file.
        """
        if format in ('json', 'pickle'):
            self.load(fname, format)
        elif w2v_type == 'fasttext':
            self.model = self._read_fasttext(fname)
        elif w2v_type == 'word2vec':
            self.model = gensim.models.KeyedVectors.load_word2vec_format(
                fname, binary=(format == 'binary'))
        else:
            self.model = {}

    def __getitem__(self, key):
        """Return the vector stored for ``key``, or ``None`` if it is unknown."""
        try:
            return self.model[key]
        except KeyError:
            return None

    def __contains__(self, key):
        """Return ``True`` when a vector is stored for ``key``."""
        # Call the lookup; subscripting the bound method raises TypeError.
        return self.__getitem__(key) is not None

    def _process_line(self, line):
        """Split one ``word v1 v2 ...`` line into ``(word, [float, ...])``."""
        tokens = line.rstrip().split(' ')
        return tokens[0], [float(val) for val in tokens[1:]]

    def _read_fasttext(self, fname):
        """Parse a fastText-style text file into a ``{word: vector}`` dict.

        The first line is always consumed as the ``<dict_size> <vec_size>``
        header and is only echoed to stdout; every following line is parsed
        with :meth:`_process_line`.
        """
        with open(fname) as f:
            # The header holds two counts, not a word plus a vector, so split
            # it directly instead of routing it through _process_line (which
            # used to report vec_size as a one-element list of floats).
            header = f.readline().split()
            dict_size = header[0] if header else None
            vec_size = header[1] if len(header) > 1 else None
            print('dict_size = {}'.format(dict_size))
            print('vec_size = {}'.format(vec_size))
            model = {}
            for line in tqdm(f, file=sys.stdout):
                word, vec = self._process_line(line)
                model[word] = vec
        return model

    def save(self, fname, format='json'):
        """Dump ``self.model`` to ``fname`` as JSON or pickle.

        Any other ``format`` value writes nothing. Returns ``self`` so calls
        can be chained.
        """
        if format == 'json':
            with open(fname, 'w') as f:
                json.dump(self.model, f)
        elif format == 'pickle':
            with open(fname, 'wb') as f:
                pickle.dump(self.model, f)
        return self

    def load(self, fname, format='json'):
        """Replace ``self.model`` with a dump previously written by :meth:`save`.

        Any other ``format`` value leaves the instance untouched. Returns
        ``self`` so calls can be chained.
        """
        if format == 'json':
            with open(fname) as f:
                self.model = json.load(f)
        elif format == 'pickle':
            # SECURITY: unpickling can execute arbitrary code; only load
            # pickle files that come from a trusted source.
            with open(fname, 'rb') as f:
                self.model = pickle.load(f)
        return self
class Logger(object):
    """Configure a ``logging.Logger`` for console/file output and forward to it.

    On construction the wrapped logger is set to ``DEBUG`` level and gets a
    stdout handler, plus a file handler when ``fname`` is given. Handlers are
    added on every construction, so wrapping the same logger twice emits each
    message twice.
    """

    def __init__(self, logger, fname=None, format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s"):
        """Attach handlers to ``logger``.

        Args:
            logger: The ``logging.Logger`` instance to configure and wrap.
            fname: Optional path of a log file; when given, messages are also
                written there through a ``logging.FileHandler``.
            format: ``logging.Formatter`` format string shared by all
                handlers created here.
        """
        self.logFormatter = logging.Formatter(format)
        self.rootLogger = logger
        self.rootLogger.setLevel(logging.DEBUG)

        self.consoleHandler = logging.StreamHandler(sys.stdout)
        self.consoleHandler.setFormatter(self.logFormatter)
        self.rootLogger.addHandler(self.consoleHandler)

        if fname is not None:
            self.fileHandler = logging.FileHandler(fname)
            self.fileHandler.setFormatter(self.logFormatter)
            self.rootLogger.addHandler(self.fileHandler)

    def warn(self, message):
        """Log ``message`` at WARNING level."""
        # ``Logger.warn`` is a deprecated alias that newer Python versions
        # no longer provide; ``warning`` is the supported spelling.
        self.rootLogger.warning(message)

    def info(self, message):
        """Log ``message`` at INFO level."""
        self.rootLogger.info(message)

    def debug(self, message):
        """Log ``message`` at DEBUG level."""
        self.rootLogger.debug(message)