+ import os
import json
import logging
import subprocess
- import sys
from argparse import ArgumentParser
from pathlib import Path
from statistics import mean

import datasets
- from bs4 import BeautifulSoup
- from bs4.dammit import EncodingDetector
from datasets import config, load_from_disk
from datasets.utils.logging import set_verbosity_info

set_verbosity_info()
logger = logging.getLogger(__name__)

- # For `soup.decode_content` that can hit the limit
- sys.setrecursionlimit(10000)
-

def get_args():
    parser = ArgumentParser()
@@ -59,6 +54,11 @@ def main():
args = get_args()
logger.info(f"** The job is run with the following arguments: **\n{args}\n ****")

+ if os.path.isfile(args.save_path_stats_json):
+     logger.info(f" --- Statistics already computed for seed id {args.seed_id}")
+     return
+
+ logger.info(f" --- Statistics not already computed for seed id {args.seed_id}")
if not args.use_datasets_caching:
    datasets.set_caching_enabled(False)
else:
@@ -92,8 +92,10 @@ def main():

ds_html = splits[selected_mime_types[0]]

+ logger.info(f"the current splits are {data_stats}.")
+
def get_length_text(example):
-     example["length_text"] = len(example["text"])
+     example["length_text"] = len(example["text"]) if example["text"] is not None else 0
    return example

cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]]
@@ -105,7 +107,9 @@ def get_length_text(example):
)

data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0])
- data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0])
+
+ non_empty_texts = [e for e in ds_html["length_text"] if e != 0]
+ data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None
data_stats["seed_id"] = args.seed_id

logger.info(f"There are {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.")
@@ -119,7 +123,8 @@ def get_length_text(example):
subprocess.run(["mv", save_path_tmp, str(save_path.absolute())])

save_path = Path(args.save_path_stats_full_json)
- save_path_tmp = f"{str(save_path.absolute())}.tmp"
+ tmp_file_name = f"tmp-{str(save_path.name)}"
+ save_path_tmp = os.path.join(save_path.parent, tmp_file_name)
logger.info(f"Saving the dataset at {save_path_tmp}")
ds_html.to_json(
    save_path_tmp,