-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy path03-merge-lines.py
80 lines (65 loc) · 1.82 KB
/
03-merge-lines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
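Merge broken lines: within the same chapter and the same paragraph, a line that
does not end with one of the terminators ":。!?" is joined with the line after it.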
"""
import os
import re
from typing import List
def split_by_pos(content: str, splitters: List[int]) -> List[str]:
    """
    Cut ``content`` into consecutive pieces at the given offsets.

    The offsets are sorted first; ``0`` and ``len(content)`` are added as
    outer boundaries when they are not already the first / last offset, so
    the returned pieces always cover the text from start to end.

    :param content: the full text to cut
    :param splitters: character offsets at which a new piece starts; may be
        empty, in which case the whole text is returned as a single piece
    :return: the pieces in order of appearance (an empty list when
        ``content`` is empty and no offsets are given)
    """
    # sorted() returns a new list, so the caller's list is never modified.
    bounds = sorted(splitters)
    # An empty offset list used to raise IndexError on bounds[0]; treat it
    # as "no interior cut points".
    if not bounds or bounds[0] != 0:
        bounds.insert(0, 0)
    if bounds[-1] != len(content):
        bounds.append(len(content))
    # Pair every boundary with its successor to obtain the slice limits.
    return [content[beg:end] for beg, end in zip(bounds, bounds[1:])]
def get_pos(res: List[re.Match]):
    """
    Turn regex match objects into the list of their start offsets.

    :param res: an iterable of match objects
    :return: ``match.start()`` of every match, in iteration order
    """
    return [match.start() for match in res]
def get_parts(content: str):
    """
    Split ``content`` into parts, cutting in front of every chapter heading.

    A heading is a line consisting of optional leading whitespace, ``第``,
    one or more of the listed Chinese / Arabic digits, ``章`` or ``回`` and
    an optional title. The cut is placed at the newline that precedes the
    heading; the actual slicing is delegated to ``split_by_pos``.

    :param content: the full text of one file
    :return: the list of parts produced by ``split_by_pos``
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions. The
    # regex engine interprets '\n' itself, so the pattern is unchanged.
    # NOTE(review): the digit class has no 百/零, so headings such as
    # 第一百回 are not recognised — confirm whether that is intended.
    headings = re.finditer(
        r'\n\s*第[一二三四五六七八九十1234567890]+[章回](.+?)?\n', content)
    return split_by_pos(content, get_pos(headings))
def merge_lines(s: str, terminators: str = ":。!?") -> str:
    """
    Join lines that were broken in the middle of a sentence.

    ``s`` is one chapter. Every line is stripped; the first line is always
    kept on its own. A later line is appended to the previous output line
    when the most recent non-empty line did not end with one of
    ``terminators``; otherwise it starts a new output line.

    :param s: the text of one chapter
    :param terminators: characters that mark a complete line when they are
        its last character
    :return: the merged text, lines joined with ``\\n``; an empty string
        when ``s`` is empty or whitespace only
    """
    lines = [line.strip() for line in s.strip().splitlines()]
    if not lines:
        # Empty / whitespace-only input used to raise IndexError on lines[0].
        return ''
    merged = [lines[0]]
    # True while the next line must start a new output line.
    sentence_closed = True
    for line in lines[1:]:
        if sentence_closed:
            merged.append(line)
        else:
            merged[-1] += line
        # Blank lines leave the state untouched, so an unfinished sentence
        # keeps absorbing text across them.
        if line:
            sentence_closed = line[-1] in terminators
    return '\n'.join(merged)
def handle(s: str):
    """
    Process the text of one file.

    The text is split into parts with ``get_parts``, each part is passed
    through ``merge_lines``, and the results are joined with ``\\n``.

    :param s: the full text of one file
    :return: the processed text
    """
    return '\n'.join(merge_lines(part) for part in get_parts(s))
# Rewrite every file directly under ./src in place, with broken lines merged.
for filename in os.listdir('src'):
    filepath = os.path.join('src', filename)
    if not os.path.isfile(filepath):
        # Sub-directories cannot be opened as text files; skip them.
        continue
    # NOTE(review): no encoding is passed, so the platform default is used
    # for both reading and writing — confirm it matches the source files.
    # Context managers make sure both handles are closed (and the output is
    # flushed) before the next file is processed.
    with open(filepath) as src_file:
        content = src_file.read()
    content = handle(content)
    with open(filepath, 'w') as dst_file:
        dst_file.write(content)