Skip to content

Commit 19c004e

Browse files
committed
update statistics
1 parent c388e47 commit 19c004e

File tree

2 files changed

+19
-13
lines changed

2 files changed

+19
-13
lines changed

dashboard/python_scripts/compute_stats.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
1+
import os
12
import json
23
import logging
34
import subprocess
4-
import sys
55
from argparse import ArgumentParser
66
from pathlib import Path
77
from statistics import mean
88

99
import datasets
10-
from bs4 import BeautifulSoup
11-
from bs4.dammit import EncodingDetector
1210
from datasets import config, load_from_disk
1311
from datasets.utils.logging import set_verbosity_info
1412

1513
set_verbosity_info()
1614
logger = logging.getLogger(__name__)
1715

18-
# For `soup.decode_content` that can hit the limit
19-
sys.setrecursionlimit(10000)
20-
2116

2217
def get_args():
2318
parser = ArgumentParser()
@@ -59,6 +54,11 @@ def main():
5954
args = get_args()
6055
logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ")
6156

57+
if os.path.isfile(args.save_path_stats_json):
58+
logger.info(f" --- Statistics already computed for seed id {args.seed_id} ")
59+
return
60+
61+
logger.info(f" --- Statistics not already computed for seed id {args.seed_id} ")
6262
if not args.use_datasets_caching:
6363
datasets.set_caching_enabled(False)
6464
else:
@@ -92,8 +92,10 @@ def main():
9292

9393
ds_html = splits[selected_mime_types[0]]
9494

95+
logger.info(f"the currents splits are {data_stats}.")
96+
9597
def get_length_text(example):
96-
example["length_text"] = len(example["text"])
98+
example["length_text"] = len(example["text"]) if example["text"] is not None else 0
9799
return example
98100

99101
cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]]
@@ -105,7 +107,9 @@ def get_length_text(example):
105107
)
106108

107109
data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0])
108-
data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0])
110+
111+
non_empty_texts = [e for e in ds_html["length_text"] if e != 0]
112+
data_stats["html_mean_length_non_empty_text"] = mean(non_empty_texts) if non_empty_texts != [] else None
109113
data_stats["seed_id"] = args.seed_id
110114

111115
logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.")
@@ -119,7 +123,8 @@ def get_length_text(example):
119123
subprocess.run(["mv", save_path_tmp, str(save_path.absolute())])
120124

121125
save_path = Path(args.save_path_stats_full_json)
122-
save_path_tmp = f"{str(save_path.absolute())}.tmp"
126+
tmp_file_name = f"tmp-{str(save_path.name)}"
127+
save_path_tmp = os.path.join(save_path.parent, tmp_file_name)
123128
logger.info(f"Saving the dataset at {save_path_tmp}")
124129
ds_html.to_json(
125130
save_path_tmp,

dashboard/slurm_scripts/compute_stats_on_pseudo_crawl.slurm

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#!/bin/bash
2-
#SBATCH --job-name=pseudo_crawl_compute_stats
2+
#SBATCH --job-name=pseudo_crawl_compute_stats_v5
33
#SBATCH --nodes=1
44
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
55
#SBATCH --cpus-per-task=4 # number of cores per tasks
66
#SBATCH --hint=nomultithread # we get physical cores not logical
77
#SBATCH --partition=cpu_p1
88
#SBATCH --time 10:00:00 # maximum execution time (HH:MM:SS)
9-
#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats/%x-%j.out # output file name
9+
#SBATCH --output=/gpfswork/rech/six/uty16tp/code/big_science/logs/compute_stats_v5/%x-%j.out # output file name
1010
#SBATCH --array=1-604
1111
#SBATCH --account=six@cpu
1212

@@ -26,9 +26,10 @@ echo "Computing stats on seed id ${SEED_ID}"
2626

2727
DATASET_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-seeds/bigscience-catalogue-data/pseudo_crawl_seed--seed-id--"$SEED_ID"
2828
SAVE_STATS_PATH=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/stats.json
29-
SAVE_STATS_PATH_FULL=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full
29+
SAVE_STATS_PATH_DIR=$six_ALL_CCFRSCRATCH/pseudo_crawl/datasets-stats/bigscience-catalogue-data/seed_id="$SEED_ID"/full
30+
SAVE_STATS_PATH_FULL=$SAVE_STATS_PATH_DIR/full.jsonl.gz
3031

31-
mkdir -p $SAVE_STATS_PATH_FULL
32+
mkdir -p $SAVE_STATS_PATH_DIR
3233

3334
export HF_DATASETS_OFFLINE=1
3435
export HF_DATASETS_CACHE=$SCRATCH/to_delete

0 commit comments

Comments (0)