File tree 1 file changed +10
-7
lines changed
1 file changed +10
-7
lines changed Original file line number Diff line number Diff line change 20
20
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
21
# SOFTWARE.
22
22
23
- import webvtt
24
23
import glob
25
24
import os .path
26
25
import re
27
26
27
+ import webvtt
28
+
28
29
import download
29
30
30
31
DATA_DIR = os .path .join (download .DATA_DIR , "captions" )
@@ -39,22 +40,24 @@ def main():
39
40
txt_file = os .path .join (DATA_DIR , "../" , "captions.txt" )
40
41
with open (txt_file , "w+" ) as f :
41
42
for vtt_file in vtt_files :
43
+ text = []
42
44
stephen_speaking = True
43
45
for caption in vtt .read (vtt_file ):
44
- text = caption .text
45
- if speaker_jon (text ):
46
+ t = caption .text
47
+ if speaker_jon (t ):
46
48
stephen_speaking = False
47
49
continue
48
- if speaker_stephen (text ):
50
+ if speaker_stephen (t ):
49
51
stephen_speaking = True
50
- text = "" .join (x [1 ] for x in re .findall (r">> (Stephen:|STEPHEN:|stephen:) (.*)$" , text ))
52
+ t = "" .join (x [1 ] for x in re .findall (r">> (Stephen:|STEPHEN:|stephen:) (.*)$" , t ))
51
53
# print(text)
52
54
if not stephen_speaking :
53
55
continue
54
56
# print(text)
55
- text = re .sub (speaker , r"\3" , text , flags = re .M )
56
- print ( text . strip ("\n " ), file = f , end = " " )
57
+ t = re .sub (speaker , r"\3" , t , flags = re .M )
58
+ text += [ t . replace ("\n " , " " )]
57
59
# print(text)
60
+ f .writelines ([" " .join (text )] + ["\n " , "<|endoftext|>" , "\n " ])
58
61
59
62
60
63
if __name__ == '__main__' :
You can’t perform that action at this time.
0 commit comments