Skip to content

Commit 3d48035

Browse files
committed
preprocess: one sample per line
1 parent ef59f6e commit 3d48035

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

caption_process.py

+10-7
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,12 @@
2020
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
# SOFTWARE.
2222

23-
import webvtt
2423
import glob
2524
import os.path
2625
import re
2726

27+
import webvtt
28+
2829
import download
2930

3031
DATA_DIR = os.path.join(download.DATA_DIR, "captions")
@@ -39,22 +40,24 @@ def main():
3940
txt_file = os.path.join(DATA_DIR, "../", "captions.txt")
4041
with open(txt_file, "w+") as f:
4142
for vtt_file in vtt_files:
43+
text = []
4244
stephen_speaking = True
4345
for caption in vtt.read(vtt_file):
44-
text = caption.text
45-
if speaker_jon(text):
46+
t = caption.text
47+
if speaker_jon(t):
4648
stephen_speaking = False
4749
continue
48-
if speaker_stephen(text):
50+
if speaker_stephen(t):
4951
stephen_speaking = True
50-
text = "".join(x[1] for x in re.findall(r"&gt;&gt; (Stephen:|STEPHEN:|stephen:) (.*)$", text+<"|endoftext|">))
52+
t = "".join(x[1] for x in re.findall(r"&gt;&gt; (Stephen:|STEPHEN:|stephen:) (.*)$", t))
5153
# print(text)
5254
if not stephen_speaking:
5355
continue
5456
# print(text)
55-
text = re.sub(speaker, r"\3", text, flags=re.M)
56-
print(text.strip("\n "), file=f, end=" ")
57+
t = re.sub(speaker, r"\3", t, flags=re.M)
58+
text += [t.replace("\n", " ")]
5759
# print(text)
60+
f.writelines([" ".join(text)] + ["\n", "<|endoftext|>", "\n"])
5861

5962

6063
if __name__ == '__main__':

0 commit comments

Comments
 (0)