-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathtext_to_transcript.py
48 lines (38 loc) · 1.33 KB
/
text_to_transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# This is a simple script to turn a text file of lines or paragraphs
# into a transcript file that can be used for the forced alignment.
# '#' is the comment character in the text files
import sys
import simplejson as json
import os.path
import click
import jsonschema
# Parameters (supplied by click from the command line):
#   text_file    -- path of the plain-text file to convert
#   output_file  -- path to write the JSON transcript to; None means stdout
#   speaker_name -- value stored in the "speaker" field of every entry
# jsonschema.validate raises jsonschema.ValidationError when the generated
# transcript does not conform to the transcript schema.
@click.command()
@click.argument('text_file')
@click.option('--output-file', default=None, help="Output transcript file")
@click.option('--speaker-name', default="Narrator", help="The name of the speaker")
def text_to_transcript(text_file, output_file, speaker_name):
    """Convert a text file into a JSON transcript for forced alignment.

    Paragraphs are separated by blank lines. Line breaks inside a paragraph
    are folded into spaces, and paragraphs that are empty or start with '#'
    are skipped. The transcript is printed to standard output unless an
    output file is given.
    """
    # Context managers make sure both input files are closed promptly.
    with open(text_file) as text_handle:
        text = text_handle.read()

    # The schema is looked up relative to this script's real location, not
    # the current working directory.
    filedir = os.path.dirname(os.path.realpath(__file__))
    schema_path = os.path.join(
        filedir, "alignment-schemas/transcript_schema.json")
    with open(schema_path) as schema_handle:
        transcript_schema = json.load(schema_handle)

    out = []
    for para in text.split("\n\n"):
        # Fold line breaks into spaces, then trim: extra blank lines or the
        # file's trailing newline would otherwise leave stray whitespace in
        # the entry and hide a leading '#' from the comment check below.
        para = para.replace("\n", " ").strip()
        if not para or para.startswith("#"):
            continue
        out.append({
            "speaker": speaker_name,
            "line": para,
        })

    # Validate the complete transcript before anything is emitted.
    jsonschema.validate(out, transcript_schema)

    serialized = json.dumps(out, indent=4)
    if output_file is None:
        # click.echo replaces the Python 2-only print statement and behaves
        # the same on Python 2 and Python 3.
        click.echo(serialized)
    else:
        with open(output_file, 'w') as f:
            f.write(serialized)
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    text_to_transcript()