-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext2slide.py
124 lines (107 loc) · 5.06 KB
/
text2slide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from summarization.abstractive import summarize
from summarization.extractive.SlideMan.youyakuman import YouyakuMan
from scraping.scraping import scraping,irasutoya
from generate.generate import generate
import argparse
import sys
import textproc
import itertools
from summarization.question_generation import qgenerator
def preprocess(_document):
    """Split a raw document into a list of cleaned paragraphs.

    Paragraphs are delimited by a blank line (two consecutive newline
    characters).  Each paragraph has its surrounding whitespace removed, and
    paragraphs that are empty *after* stripping are discarded, so chunks
    containing only spaces or stray newlines never reach the caller.

    Args:
        _document: The full input text.

    Returns:
        A list of non-empty, stripped paragraph strings in document order.
    """
    stripped = (chunk.strip() for chunk in _document.split('\n\n'))
    # Filter after stripping: a whitespace-only chunk is not '' before the
    # strip but becomes '' afterwards.
    return [paragraph for paragraph in stripped if paragraph]
class PrintMarkdown:
    """Write a slide deck as Markdown to an already-open text stream.

    Constructing an instance performs the whole output: a YAML front-matter
    cover followed by one ``##`` section per key of ``summaries`` (in sorted
    key order).  The fenced-div column markup that is emitted looks like
    Pandoc slide syntax -- NOTE(review): confirm the target renderer.

    Args:
        outfile: Writable text stream; only ``write`` is used.
        titles: Mapping from section key to the section heading.
        summaries: Mapping from section key to an iterable of bullet
            sentences.  Its keys define which sections are written.
        pictures: Mapping from section key to an image path.  A missing or
            falsy entry means the section is written without a picture.
    """

    def __init__(self, outfile, titles, summaries, pictures):
        self.outfile = outfile
        self.titles = titles
        self.summaries = summaries
        self.pictures = pictures
        self.sections = sorted(summaries.keys())
        self.print_cover()
        self.print_pages()

    def print_cover(self):
        """Write the YAML metadata block (title, subtitle, author)."""
        self.outfile.write("---\n")
        self.outfile.write("title: タイトル\n")
        self.outfile.write("subtitle: サブタイトル\n")
        self.outfile.write("author: 実験1班\n")
        self.outfile.write("---\n")

    def print_pages(self):
        """Write every section: heading, bullets and, if present, a picture.

        Raises:
            KeyError: If a section key has no entry in ``titles``.
        """
        write = self.outfile.write

        def print_title(title):
            write("## %s\n" % (title,))

        def print_sentences(sentences):
            for sentence in sentences:
                # Each bullet is also echoed to stdout.
                print(sentence)
                write("- %s\n" % (sentence,))

        def print_picture(picture):
            # Markdown image syntax.  The format string needs a placeholder
            # for the path; without one the %-formatting raises TypeError.
            write("![](%s)\n" % (picture,))

        for i in self.sections:
            picture_path = self.pictures.get(i)
            print_title(self.titles[i])
            if picture_path:
                # Picture available: two columns, bullets left (65%) and the
                # picture right (35%).
                write(":::::::::::::: {.columns}\n::: {.column width=\"65%\"}\n")
                print_sentences(self.summaries[i])
                write(":::\n::: {.column width=\"35%\"}\n")
                print_picture(picture_path)
                write(":::\n::::::::::::::\n")
            else:
                # No picture: bullets only, full width.
                print_sentences(self.summaries[i])
# Similarity at or above which an extractive sentence and an abstractive
# sentence are treated as saying the same thing.  The original author noted
# that 0.88 is probably not the right value.
_SIMILARITY_TH = 0.88


def _make_title(paragraph):
    """Return the slide title for one paragraph."""
    summary_ja = summarize.summarize(paragraph, 'google/pegasus-xsum')

    # A summary that already contains a question mark is used as the title
    # directly.  The summary is Japanese text, so the full-width mark is
    # checked as well as the ASCII one.
    # TODO (carried over from the original): refine this check.
    if '?' in summary_ja or '？' in summary_ja:
        return textproc.titleize(summary_ja, False)

    # Per the original author's note this translation is already cached, so
    # no new request is sent.
    summary_en = summarize.translate_to_en(summary_ja)
    title_en = qgenerator.generate(summary_en)
    # Translate once and reuse the result for both the log line and the title.
    title_ja = summarize.translate_to_ja(title_en)
    print(title_ja)
    if title_en != '':
        return textproc.titleize(title_ja)
    # The question generator produced nothing: fall back to the summary.
    return textproc.titleize(summary_ja)


def _make_bullets(paragraph):
    """Return the bullet sentences for one paragraph."""
    # Per the original author's note, YouyakuMan returns a list with (at
    # least) the requested number of extracted sentences.
    extractive = YouyakuMan(paragraph, 3)
    abstractive = summarize.summarize(
        paragraph, 'google/pegasus-cnn_dailymail').split("。")

    def restyle(sentence):
        # Same two textproc passes the pipeline applies to every sentence.
        return textproc.taigendomize(textproc.desmasu2dadearu(sentence))

    extractive = [restyle(sentence) for sentence in extractive]
    # Splitting on the full stop leaves empty fragments; drop them.
    abstractive = [restyle(sentence) for sentence in abstractive if sentence != '']

    # Where an abstractive sentence is similar enough to an extractive one
    # and shorter, use the abstractive sentence in its place.  Comparisons
    # keep using the original extractive sentence.
    for j, text_ext in enumerate(extractive):
        for text_abs in abstractive:
            print(text_ext, ' / ', text_abs)
            similarity = textproc.calc_similarity(text_ext, text_abs)
            print(similarity)
            if similarity >= _SIMILARITY_TH and len(text_ext) > len(text_abs):
                extractive[j] = text_abs
    return extractive


def text2slide(document, output="output"):
    """Turn a plain-text document into slides.

    The document is split into paragraphs with ``preprocess``; each paragraph
    becomes one slide with a title, bullet sentences and a picture.  The
    result is written as Markdown to ``output + ".md"`` and then
    ``generate(document, output)`` is called.

    Args:
        document: The full input text.
        output: Output path without extension.
    """
    titles = {}
    contents = {}
    pictures = {}
    for i, paragraph in enumerate(preprocess(document)):
        titles[i] = _make_title(paragraph)
        print(titles[i])
        contents[i] = _make_bullets(paragraph)
        pictures[i] = irasutoya(scraping(paragraph, i), i)

    print("summarization result:\n---")
    print(titles)
    print(contents)
    print("----")

    with open(output + ".md", mode='w', encoding='utf8', buffering=1) as outfile:
        PrintMarkdown(outfile, titles, contents, pictures)
    # Called after the file is closed so the Markdown is completely on disk.
    # NOTE(review): presumably generate() consumes that file -- confirm.
    generate(document, output)
def main():
    """Command-line entry point.

    Reads the UTF-8 text file named by ``--input``/``-i`` (required) and
    passes its contents to ``text2slide`` together with the ``--output``/``-o``
    value (default ``"output"``).
    """
    cli = argparse.ArgumentParser(description="")
    cli.add_argument("--input", "-i", type=str, required=True, help="")
    cli.add_argument("--output", "-o", type=str, default="output", help="")
    options = cli.parse_args()

    with open(options.input, mode='r', encoding='utf8') as source:
        document = source.read()

    text2slide(document, options.output)


if __name__ == "__main__":
    main()