# coding: utf-8

# In[1]:

import re
import time
import itertools

import nltk
import pymysql
from pymysql import InternalError
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
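# Third-party requirements: an older selenium 3.x release (PhantomJS support and
# the find_element_by_* helpers were removed in Selenium 4), nltk, and pymysql.
# NLTK additionally needs its tokenizer/tagger data, e.g.:
#   nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')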

# In[9]:

from abc import abstractmethod, ABC


class Scraper(ABC):
    def __init__(self, start_link, dynamic=False):
        self.browser = self.get_crawler(dynamic)
        self.db_executer = self.connect_to_database()
        self.start_link = start_link
        self.temporary_memory = set()  # terms already scraped during this run

    def connect_to_database(self):
        # conn/cur are module-level globals so insert_to_database() and
        # finalize() can commit and close the same connection.
        global conn
        global cur
        conn = pymysql.connect(host='35.198.176.76', user='root', passwd=None, db='mysql', charset='utf8')
        cur = conn.cursor()
        cur.execute("USE definitions")
        return cur
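
    # The SQL below assumes a table shaped roughly like this (a sketch only;
    # the actual schema is not part of this script):
    #
    #   CREATE TABLE terms (
    #       term        VARCHAR(255),
    #       definitions TEXT,
    #       examples    TEXT
    #   );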

    def get_crawler(self, dynamic):
        crawler = None
        if dynamic:
            crawler = PhantomJS()
        return crawler

    @abstractmethod
    def scraping_strategy(self):
        pass
    def insert_to_database(self, row):
        term, definition, example = row
        if example is None:
            example = ""
        cur = self.db_executer
        cur.execute("SELECT definitions, examples FROM terms WHERE term = '{}'".format(term))
        res = cur.fetchall()
        if len(res) > 0:
            # Term already stored: append the new definition/example pair.
            definitions_alr = res[0][0].replace("'", "`")
            examples_alr = res[0][1]
            print("Already in database - ", res)
            if definition not in res[0][0]:
                cur.execute(
                    "UPDATE terms SET definitions = '{0} &$ {1}', examples = '{2} &$ {3}' WHERE term = '{4}'".format(
                        definitions_alr, definition, examples_alr, example, term))
        else:
            query = "INSERT INTO terms (term, definitions, examples) VALUES ('{0}', '{1}', '{2}')".format(
                term, definition, example)
            cur.execute(query)
        conn.commit()

    def finalize(self):
        cur.close()
        conn.close()
    def find_pos_tag(self, sentence, needed_word):
        # Match the word and simple inflections ("s"/"es" endings).
        pattern = re.compile(re.escape(needed_word) + r"e?s?", re.M | re.I)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
        needed_tag = [tag for token, tag in pos_tags if len(re.findall(pattern, token)) > 0]
        try:
            needed_tag = needed_tag[0]
        except IndexError:
            needed_tag = ""
        return needed_tag
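
    # Hypothetical call (assumes the NLTK tagger data is installed; the exact
    # tag depends on the tagger):
    #   self.find_pos_tag("Dogs bark at strangers", "dog")  ->  e.g. "NNS"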

    def handle_internal_error(self):
        # Kill every open MySQL thread, then reconnect.
        cur = self.db_executer
        cur.execute("SHOW FULL PROCESSLIST")
        results = cur.fetchall()
        thread_ids = [result[0] for result in results]
        for thread_id in thread_ids:
            cur.execute("KILL {}".format(thread_id))
        self.db_executer = self.connect_to_database()

# In[7]:

class VocabularyScraper(Scraper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess_exs_defs(self, term):
        webdriver = self.browser
        term = term.replace("'", "`")
        # Short/long blurbs plus the per-sense <h3> definitions on the word page.
        definitions = list(itertools.chain(*[webdriver.find_elements_by_css_selector(name) for name in
                                             [".wordPage .blurb .short", ".wordPage .blurb .long",
                                              ".wordPage .definition h3"]]))
        examples = webdriver.find_elements_by_css_selector(".wordPage .example")
        definitions = self.remove_redund_chars(definitions)
        examples = self.remove_redund_chars(examples)
        # "&$" separates records; "&<tag>&" prefixes each example with its POS tag.
        examples = " &$ ".join(['&%s& %s' % (self.find_pos_tag(example, term), example) for example in examples])
        definitions = " &$ ".join(definitions)
        return term, definitions, examples

    def remove_redund_chars(self, iterable):
        if iterable is None:
            return []
        # Back-ticks keep single quotes from breaking the SQL strings built above.
        return [element.text.replace("'", '`') for element in iterable]
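
    # A hypothetical row produced by preprocess_exs_defs() above, showing the
    # "&$" record separator and the "&<POS>&" tag prefix on each example:
    #   ('term', 'first definition &$ second definition',
    #    '&NN& first example sentence &$ &NN& second example sentence')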

    def enter_search_field(self, key):
        webdriver = self.browser
        search_field = webdriver.find_element_by_id("search")
        search_field.clear()
        search_field.send_keys(key)
        # search_field.send_keys(Keys.ENTER)
        WebDriverWait(webdriver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "entry")))
    def preparation(self, key):
        found = False
        while not found:
            try:
                webdriver = self.browser
                webdriver.get(self.start_link)
                self.enter_search_field(key)
                found = True
            except TimeoutException:
                print("Timed out; retrying to reach the search field")
    def load_more(self):
        # Scroll the "hasmore" container to trigger loading of further autocomplete entries.
        webdriver = self.browser
        div = webdriver.find_element_by_class_name("hasmore")
        webdriver.execute_script("arguments[0].scrollTop += arguments[0].scrollHeight", div)
    def scraping_strategy(self):
        letter = "a"
        self.preparation(letter)
        scraped = self.temporary_memory
        webdriver = self.browser
        clicked = {0, 1}  # dummy sentinel so the first len(clicked - words) check passes
        words = set()
        time_started = time.time()
        while len(clicked - words) != 0:
            words = set(webdriver.find_elements_by_css_selector(".autocomplete .word"))
            clicked = clicked | words
            for word in words:
                try:
                    term = word.get_attribute("innerHTML")
                    if term not in scraped:
                        word.click()
                        time.sleep(1)
                        print("Term is ", term)
                        scraped.add(term)
                        row = self.preprocess_exs_defs(term)
                        print(row)
                        self.insert_to_database(row)
                except StaleElementReferenceException:
                    print("Word %s was not loaded into dataset" % term)
                    continue
                except InternalError:
                    print("Trying to handle multiple threading error")
                    self.handle_internal_error()
            time.sleep(.8)
            self.load_more()
        time_ended = time.time()
        print("Scraping of the letter '{}' was successfully finished, it took {} seconds".format(
            letter, time_ended - time_started))
        self.finalize()

test = VocabularyScraper("https://www.vocabulary.com/dictionary/", dynamic=True)

# In[80]:

test.scraping_strategy()

# In[78]:

test.finalize()
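
# Note: the SQL above is built with str.format(), which is why single quotes are
# rewritten to back-ticks throughout. A hardened variant would use pymysql's
# parameterized queries instead, e.g. (a sketch, same assumed table):
#   cur.execute("SELECT definitions, examples FROM terms WHERE term = %s", (term,))
#   cur.execute("INSERT INTO terms (term, definitions, examples) VALUES (%s, %s, %s)",
#               (term, definitions, examples))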