-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPlagCreator.py
393 lines (308 loc) · 16.6 KB
/
PlagCreator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import os
import random
import re
from enum import Enum
import pickle
import time
import collections
import csv
# text modes
class Text_mode(Enum):
    '''
    Selects how the surrounding (non-plagiarized) text is generated
    '''
    # words are picked independently at random from a word list
    simple = 0
    # words are chained with the markov database built from the wiki articles
    markov = 1
# plag modes
class Plag_mode(Enum):
    '''
    Selects how an extract is altered before it is embedded into the target text
    '''
    # extract is inserted unchanged as one contiguous block
    one_to_one = 0
    # words of the extract are randomly reordered before insertion
    shuffle = 1
    # some words of the extract are replaced by other words before insertion
    replace = 2
    # words of the extract are inserted one by one with random gaps between them
    distance_between_words = 3
# Record describing one extract taken from a wiki article:
#   article - key of the article in the wiki articles dictionary
#   extract - the extracted words as a list
#   start   - index of the first extracted word in the article
#   end     - index of the last extracted word in the article (inclusive)
Source_Info = collections.namedtuple(
    'Source_Info',
    ['article', 'extract', 'start', 'end'],
)
class PlagCreator:
    '''
    Creates artificial texts with embedded extracts ("plags") taken from wiki articles.

    On construction the wiki dump is parsed and the word list and lookup dictionary
    for the markov chain text generator are built. generate_plags() then writes the
    target texts together with info files describing the embedded extracts.
    '''

    # line that separates titles and article texts in the wiki dump file
    _ARTICLE_SEPARATOR = "-------------------------------------------------"

    def __init__(self):
        self.wiki_articles = self.parse_wiki_dump()
        self.words, self.db = self.make_words_list_and_db()

    def parse_wiki_dump(self, source_file="dump/clean_dump.txt"):
        '''
        Parses a wiki dump textfile
        :param source_file: path of the dump file; title blocks of the form "[id] title" and
                            article texts alternate, divided by the separator line
        :return: dictionary with key: tuple (id of wiki article, title of wiki article) and
                 value: text of wiki article as list of lower case words
        '''
        print("parsing wiki dump file...")
        with open(source_file, "r") as text_file:
            wiki = text_file.read()  # whole file in a string
        # split file at each separator line and reduce every whitespace run to one space
        texts = [re.sub(r'\s+', ' ', x).strip() for x in wiki.split(self._ARTICLE_SEPARATOR)]
        texts = [x for x in texts if x != '']
        text_dict = {}
        # blocks alternate title, text, title, text, ...; a trailing title without text is ignored
        for header, body in zip(texts[0::2], texts[1::2]):
            split_title = header.split('] ', 1)
            # remove section headings. Each article is a single line at this point, so the
            # pattern must not be greedy across headings or it would delete the text between them
            body = re.sub(r'={2,}[^=]*={2,}', '', body)
            body = re.sub(r'\W', ' ', body)  # remove punctuation chars
            body = re.sub(r'\d+', '', body)  # remove numbers
            # split() without argument drops the empty strings that split(' ') would
            # produce for leading, trailing or repeated spaces
            text_dict[(split_title[0][1:], split_title[1])] = body.lower().split()
        return text_dict

    def make_words_list_and_db(self):
        '''
        Generates a list of words and dictionary for the markov chain text generator
        :return: tuple with list of words and dictionary; the dictionary maps each pair of
                 succeeding words to the list of all words that follow this pair
        '''
        # concat all texts of all wiki articles as one big list of words
        print("making words list...")
        words = []
        for text in self.wiki_articles.values():
            words.extend(text)
        # append the two first words to the end, to avoid KeyError in markov
        if words:
            words.append(words[0])
            words.append(words[1])
        # build a dictionary with a two word key and a list with all succeeding words as the value
        print("making db dictionary...")
        db = dict()
        for w1, w2, w3 in zip(words, words[1:], words[2:]):
            db.setdefault((w1, w2), []).append(w3)
        return (words, db)

    def text_generator_simple(self, min_length, max_length, source_file="wordlist/germanWords.txt"):
        '''
        Generates a random text out of a list of words
        :param min_length: min length of the text (lower limit of a random length)
        :param max_length: max length of the text (upper limit of a random length)
        :param source_file: path of the word list file containing one word per line
        :return: generated text as list of words
        '''
        with open(source_file, "r") as text_file:
            words = text_file.read().splitlines()  # read lines of file to list
        length = random.randint(min_length, max_length)
        return [random.choice(words) for _ in range(length)]  # randomly choosing words

    def text_generator_markov(self, min_length, max_length):
        '''
        Generates a random text using markov chain. The generated text looks more natural
        :param min_length: min length of the text (lower limit of a random length)
        :param max_length: max length of the text (upper limit of a random length)
        :return: generated text as list of words
        '''
        # randomly choose the index of the word to start with (seed)
        seed_index = random.randrange(0, len(self.words) - 3)
        w1, w2 = self.words[seed_index], self.words[seed_index + 1]
        text = []
        length = random.randint(min_length, max_length)
        for _ in range(length):
            text.append(w1)
            # randomly choose one possible word for the selected key
            w1, w2 = w2, random.choice(self.db[(w1, w2)])
        # no extra word is appended after the loop: the text has exactly `length` words
        # and every word is a recorded successor of the two words in front of it
        return text

    def shuffle_plag(self, plag):
        '''
        Randomly shuffles extract in place
        :param plag: wiki source extract
        '''
        random.shuffle(plag)

    def replace_plag(self, plag):
        '''
        Randomly replaces words in extract with other words (in place). About 20 percent
        of the words are replaced, but at least one. An empty extract is left untouched.
        NOTE: if self.words contains nothing but the word to be replaced, no replacement
        can be found and the search does not terminate.
        :param plag: wiki source extract
        '''
        if not plag:
            return
        number_replacements = max(1, int(len(plag) * 0.2))
        # sample draws distinct positions directly
        for i in random.sample(range(len(plag)), number_replacements):
            replacement = random.choice(self.words)
            while plag[i] == replacement:
                replacement = random.choice(self.words)
            plag[i] = replacement

    def get_plag_text(self, length):
        '''
        Randomly chooses a text part out of a wiki article
        :param length: length of the plagiarized text part
        :return: Source_Info (key of article, text part as list of words, start and end
                 position of the text part in the article)
        :raises ValueError: if no wiki article has at least length words
        '''
        # only wiki articles that are long enough can be chosen
        candidates = [title for title, words in self.wiki_articles.items() if len(words) >= length]
        if not candidates:
            raise ValueError("no wiki article with at least " + str(length) + " words")
        article_title = random.choice(candidates)
        article = self.wiki_articles[article_title]
        # randomly choose start position of plag
        start = random.randint(0, len(article) - length)
        # cut text part out
        plag = article[start: start + length]
        return Source_Info(article_title, plag, start, start + len(plag) - 1)

    def detect_overlapping_plags(self, existing_positions, new_position):
        '''
        Detects overlapping of two plags
        :param existing_positions: list of tuples (start, end) with the positions of all plagiarisms in text
        :param new_position: tuple (start, end) containing new intended position for plag
        :return: true if overlapping
        '''
        return any(new_position[0] <= pos[1] and pos[0] <= new_position[1]
                   for pos in existing_positions)

    def _insert_plag(self, text, plag, plag_mode, plag_length, max_word_distance, plag_positions):
        '''
        Inserts one plag into the target text and records its position
        :param text: target text as list of words; modified in place
        :param plag: Source_Info of the extract; plag.extract is modified in place by shuffle/replace
        :param plag_mode: Plag_mode to apply
        :param plag_length: length of the plagiarized text part
        :param max_word_distance: upper limit for the gap between two words (distance_between_words only)
        :param plag_positions: list of tuples (start, end) of the plags already in text; modified in place
        :return: list with the info lines describing the inserted plag
        '''
        # plag infos common in all modes (built before the extract is altered)
        infos = ["source_text_ID: " + plag.article[0],
                 "source_text_title: " + plag.article[1],
                 "extract_from_source_text: " + ' '.join(plag.extract),
                 "plag_start_in_source_text: " + str(plag.start),
                 "plag_end_in_source_text: " + str(plag.end),
                 "plag_length: " + str(plag_length)]
        # position of plag in surrounding text
        start = random.randrange(0, len(text))
        spread_words = plag_mode == Plag_mode.distance_between_words
        word_pos_list = []
        if spread_words:
            # create distance between words pattern always starting with 0
            offset = 0
            for word in plag.extract:
                word_pos_list.append((offset, word))
                offset += random.randint(1, max_word_distance)
            span = word_pos_list[-1][0]
        else:
            span = len(plag.extract) - 1
        end = start + span
        # search a position that does not overlap an existing plag; the span is kept on
        # every retry so that spread words are also covered by the recorded end position
        while self.detect_overlapping_plags(plag_positions, (start, end)):
            start = random.randrange(0, len(text))
            end = start + span
        # plags behind the new one move back by the number of inserted words
        for i, (other_start, other_end) in enumerate(plag_positions):
            if start < other_start:
                plag_positions[i] = (other_start + plag_length, other_end + plag_length)
        # save plag position
        plag_positions.append((start, end))
        if spread_words:
            # insert every single word with its distance into target text
            for offset, word in word_pos_list:
                position = start + offset
                text.insert(position, word)
                infos.extend(("word: " + word,
                              "word_position_target_text: " + str(position)))
        else:
            if plag_mode == Plag_mode.shuffle:
                self.shuffle_plag(plag.extract)
            if plag_mode == Plag_mode.replace:
                self.replace_plag(plag.extract)
            # insert plag into surrounding text
            text[start: start] = plag.extract
            infos.extend(("text_in_target_text: " + ' '.join(plag.extract),
                          "plag_start_in_target_text: " + str(start),
                          "plag_end_in_target_text: " + str(end)))
        # plag infos common in all plag modes
        infos.extend(("plag_mode: " + plag_mode.name, "target_text_length: " + str(len(text))))
        return infos

    @staticmethod
    def _write_file(file_name, content):
        '''
        Writes content to a text file, replacing an existing file
        '''
        with open(file_name, "w") as output_file:
            output_file.write(content)

    def generate_plags(self, text_mode, plag_mode, number_of_texts, min_text_length, max_text_length,
                       plag_length, output_dir, max_word_distance=1, number_of_plags_per_text=1):
        '''
        Generates texts with embedded plagiarism + info file for each text and outputs them to txt files.
        In addition plag_infos.csv with one row per embedded plag is written to output_dir.
        NOTE: with number_of_plags_per_text > 1 the info file of a text describes only the plag
        inserted last; the csv file contains a row for every plag.
        :param text_mode: Text_mode used to generate the surrounding text
        :param plag_mode: Plag_mode used to alter and embed the extracts
        :param number_of_texts: number of texts to be generated
        :param min_text_length: min length of the surrounding text (lower limit of a random length)
        :param max_text_length: max length of the surrounding text (upper limit of a random length)
        :param plag_length: length of the plagiarized text part
        :param output_dir: output directory for the generated texts
        :param max_word_distance: upper limit for the gap between two words of an extract
                                  (only used with Plag_mode.distance_between_words)
        :param number_of_plags_per_text: number of extracts embedded into each text
        '''
        # create output_dir if not existing
        os.makedirs(output_dir, exist_ok=True)
        fieldnames = ['plag_ID', 'article_ID', 'start_in_source_text', 'end_in_source_text', 'plag_mode']
        # the csv module expects files opened with newline='', else rows may get extra line breaks
        with open(os.path.join(output_dir, 'plag_infos.csv'), 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            # plag_ID is used for creation of files containing target texts and infos
            for plag_ID in range(number_of_texts):
                if text_mode == Text_mode.simple:
                    text = self.text_generator_simple(min_text_length, max_text_length)
                elif text_mode == Text_mode.markov:
                    text = self.text_generator_markov(min_text_length, max_text_length)
                else:
                    print("NO SUCH TEXT MODE (" + str(text_mode) + ")!")
                    return
                plag_infos = []
                plag_positions_in_target_text = []
                for _ in range(number_of_plags_per_text):
                    # randomly choose plag
                    plag = self.get_plag_text(plag_length)
                    plag_infos = self._insert_plag(text, plag, plag_mode, plag_length,
                                                   max_word_distance, plag_positions_in_target_text)
                    writer.writerow({'plag_ID': plag_ID, 'article_ID': plag.article[0],
                                     'start_in_source_text': plag.start, 'end_in_source_text': plag.end,
                                     'plag_mode': plag_mode.value})
                # convert list of words into space separated string
                target_text = ' '.join(text)
                # concatenate plag_infos to string
                plag_infos_str = "".join(info + "\n" for info in plag_infos)
                # without plags there are no plag infos, only the text length is reported
                if number_of_plags_per_text <= 0:
                    plag_infos_str += "target_text_length: " + str(len(text))
                print(plag_infos_str)
                print("target_text:\n" + target_text)
                print("\n")
                output_file_base = os.path.join(output_dir, "plag" + str(plag_ID))
                # write text to file
                self._write_file(output_file_base + ".txt", target_text)
                # write info file
                self._write_file(output_file_base + "_info.txt", plag_infos_str)
# measure execution time
# time.clock() was removed in Python 3.8; time.perf_counter() is a documented replacement
start = time.perf_counter()
# read PlagCreator object from disk if existing, else create it
# NOTE: pickle must only be used with a trusted cache file, loading executes code from the file
if os.path.exists("PlagCreator.p"):
    with open("PlagCreator.p", "rb") as cache_file:
        pc = pickle.load(cache_file)
else:
    pc = PlagCreator()
    with open("PlagCreator.p", "wb") as cache_file:
        pickle.dump(pc, cache_file)
# execute generate_plags with desired parameters
pc.generate_plags(Text_mode.markov, Plag_mode.distance_between_words, number_of_texts=10, number_of_plags_per_text=2,
                  min_text_length=100, max_text_length=500,
                  plag_length=10, max_word_distance=4, output_dir="plag")
print("execution time: %.3f seconds" % (time.perf_counter() - start))