
Commit c388e47

Merge remote-tracking branch 'upstream/master' into compute-statistics

2 parents: 9368b49 + 1c63e0b

37 files changed: +20109 -19 lines

cc_pseudo_crawl/.gitignore (+1)

+ sourcing_sheet_seeds/seeds.gz.parquet

cc_pseudo_crawl/DEPTH.md (new file, +17)
## Strategy to get depth 1

### Context

Once we've extracted all the seed pages, we plan to make a pseudo crawl. The idea is simple:
- we extract the outgoing urls from those pages.
- we find the most recent record in CC matching each url (if it exists).
- we run the entire processing for all the new records/pages.
- we update `outgoing_urls` to obtain `outgoing_ids`.

### Process

1. Make the Athena query.
2. Preprocess the dataset: load_warc, obtain pdf_urls, extract external_urls (a minimal extraction sketch is shown below).
3. Build a new query with all `external_urls`.
4. Repeat 1-3 until reaching the depth we want.
5. Finalise via `finalise.py`: generate ids and generate `external_ids` that map to rows inside the dataset.
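
Below is a minimal, illustrative sketch of the external-URL extraction step, assuming plain HTML input and using only the Python standard library; the class and function names are placeholders, not the repository's actual helpers.

```python
# Illustrative sketch only: collect absolute outgoing URLs that point outside
# the page's own host, which is roughly what "extract external_urls" needs.
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse


class LinkExtractor(HTMLParser):
    """Collect the href targets of all <a> tags."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


def extract_external_urls(html, page_url):
    """Return absolute http(s) URLs that leave the page's host."""
    parser = LinkExtractor()
    parser.feed(html)
    page_host = urlparse(page_url).netloc
    external = set()
    for href in parser.links:
        absolute = urljoin(page_url, href)
        parsed = urlparse(absolute)
        if parsed.scheme in ("http", "https") and parsed.netloc != page_host:
            external.add(absolute)
    return sorted(external)
```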

cc_pseudo_crawl/README.md (+44 -11)
@@ -26,15 +26,14 @@ For every site list
  ```
  aws s3 cp seeds.gz.parquet s3://bucket/path/seeds/
  ```
- Note: the S3 path must point to a bucket with write permissions granted. The path needs to be adjusted also in follwing commands.
+ Note: the S3 path must point to a bucket with write permissions granted. The path also needs to be adjusted in the following commands.

  3. import the seed table into Athena
  ```sql
- CREATE EXTERNAL TABLE IF NOT EXISTS bigscience.seeds (
+ CREATE EXTERNAL TABLE IF NOT EXISTS bigscience.seed (
    `id` int,
    `title` string,
    `link` string,
-   `language` string,
    `url_path_prefix` string,
    `url_host_name` string,
    `url_host_registered_domain` string,
@@ -48,18 +47,14 @@ For every site list

  4. join the seeds table crawl by crawl with Common Crawl's index, creating a temporary table which is later used as one partition of the result table
  ```
- python3 cc_lookup.py s3://bucket/path seeds "CC-MAIN-2021"
+ python3 cc_lookup_seed.py s3://bucket/path seeds "CC-MAIN-2021"
  ```
  This will run the join for all crawls of the year 2021 and put the join data into `s3://bucket/path/cc`.

  5. finally, create a table holding the result data in order to get further metrics or prepare the content export
  ```sql
- CREATE EXTERNAL TABLE IF NOT EXISTS bigscience.cc (
-   id INT,
-   title STRING,
-   link STRING,
-   language STRING,
-   url_surtkey_prefix STRING,
+ CREATE EXTERNAL TABLE IF NOT EXISTS bigscience.cc_seed (
+   seed_id INT,
    url_surtkey STRING,
    url_host_tld STRING,
    url_host_registered_domain STRING,
@@ -85,5 +80,43 @@ For every site list

  6. load the partitions of the join table
  ```sql
- MSCK REPAIR TABLE bigscience.cc;
+ MSCK REPAIR TABLE bigscience.cc_seed;
  ```
+
+ 7. deduplicate by URL, keeping only the most recent fetch of every url
+ ```sql
+ CREATE TABLE bigscience.cc_seed_dedup_url
+ WITH (external_location = 's3://bucket/path/cc-seed_dedup_url/',
+       partitioned_by = ARRAY['subset'],
+       format = 'PARQUET',
+       parquet_compression = 'GZIP')
+ AS
+ WITH tmp AS (
+   SELECT *, row_number() over (partition by url order by fetch_time desc) row
+   FROM bigscience.cc_seed
+ )
+
+ SELECT
+   seed_id,
+   url,
+   url_surtkey,
+   url_host_tld,
+   url_host_registered_domain,
+   url_host_name,
+   fetch_status,
+   fetch_time,
+   warc_filename,
+   warc_record_offset,
+   warc_record_length,
+   fetch_redirect,
+   content_mime_detected,
+   content_languages,
+   subset
+ FROM tmp
+ WHERE row = 1
+ ```
+ 8. run `download_warc.py` to download the WARC records; we provide two helpers for convenience, a bash script and a Slurm script (a minimal fetch sketch is shown after this section)
+
+ 9. run `preprocess_dataset.py` to populate specific columns such as `outgoing_links`, `depth`, ...
+
+ 10. send the dataset to a bucket somewhere
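
For step 8, the sketch below shows what fetching a single record can look like, driven by the `warc_filename`, `warc_record_offset` and `warc_record_length` columns of the deduplicated table. It uses only the standard library and is an illustration under those assumptions, not necessarily how `download_warc.py` is implemented.

```python
import gzip
import urllib.request

# Public Common Crawl data location (the same host referenced for the crawl index).
CC_PREFIX = "https://commoncrawl.s3.amazonaws.com/"


def fetch_warc_record(warc_filename, offset, length):
    """Fetch one gzipped WARC record via an HTTP range request and return its decompressed bytes."""
    request = urllib.request.Request(
        CC_PREFIX + warc_filename,
        headers={"Range": f"bytes={offset}-{offset + length - 1}"},
    )
    with urllib.request.urlopen(request) as response:
        compressed = response.read()
    # Each record is stored as an independent gzip member, so this slice decompresses on its own.
    return gzip.decompress(compressed)
```

Each row of `bigscience.cc_seed_dedup_url` supplies the three arguments, e.g. `fetch_warc_record(row["warc_filename"], row["warc_record_offset"], row["warc_record_length"])`.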

cc_pseudo_crawl/cc_lookup_next.py (new file, +181)
# iterate over monthly crawls and store
# the joined data as a partition of the result table

import logging
import re
import sys

from pyathena import connect

logging.basicConfig(
    level="INFO", format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)

join_template = """
CREATE TABLE {db}._tmp_overlap
WITH (external_location = '{s3_location}/crawl={crawl}/',
      partitioned_by = ARRAY['subset'],
      format = 'PARQUET',
      parquet_compression = 'GZIP')
AS SELECT
       cc.url_surtkey AS url_surtkey,
       cc.url_host_tld AS url_host_tld,
       cc.url_host_registered_domain AS url_host_registered_domain,
       cc.url_host_name AS url_host_name,
       cc.url AS url,
       cc.fetch_status AS fetch_status,
       cc.fetch_time AS fetch_time,
       cc.warc_filename AS warc_filename,
       cc.warc_record_offset AS warc_record_offset,
       cc.warc_record_length AS warc_record_length,
       cc.fetch_redirect AS fetch_redirect,
       cc.content_mime_detected AS content_mime_detected,
       cc.content_languages AS content_languages,
       cc.subset AS subset
FROM ccindex.ccindex AS cc
  RIGHT OUTER JOIN {db}.{url_table} AS {tid}
    ON cc.url = {tid}.url
WHERE cc.crawl = '{crawl}'
"""

drop_tmp_table = "DROP TABLE `{db}._tmp_overlap`;"

# list of crawls
# Note: in order to get a list of released crawls:
#  - query Athena
#      SHOW PARTITIONS ccindex
#  - see
#      https://commoncrawl.s3.amazonaws.com/crawl-data/index.html
crawls = [
    "CC-MAIN-2013-20",
    "CC-MAIN-2013-48",
    #
    "CC-MAIN-2014-10",
    "CC-MAIN-2014-15",
    "CC-MAIN-2014-23",
    "CC-MAIN-2014-35",
    "CC-MAIN-2014-41",
    "CC-MAIN-2014-42",
    "CC-MAIN-2014-49",
    "CC-MAIN-2014-52",
    #
    "CC-MAIN-2015-06",
    "CC-MAIN-2015-11",
    "CC-MAIN-2015-14",
    "CC-MAIN-2015-18",
    "CC-MAIN-2015-22",
    "CC-MAIN-2015-27",
    "CC-MAIN-2015-32",
    "CC-MAIN-2015-35",
    "CC-MAIN-2015-40",
    "CC-MAIN-2015-48",
    #
    "CC-MAIN-2016-07",
    "CC-MAIN-2016-18",
    "CC-MAIN-2016-22",
    "CC-MAIN-2016-26",
    "CC-MAIN-2016-30",
    "CC-MAIN-2016-36",
    "CC-MAIN-2016-40",
    "CC-MAIN-2016-44",
    "CC-MAIN-2016-50",
    #
    "CC-MAIN-2017-04",
    "CC-MAIN-2017-09",
    "CC-MAIN-2017-13",
    "CC-MAIN-2017-17",
    "CC-MAIN-2017-22",
    "CC-MAIN-2017-26",
    "CC-MAIN-2017-30",
    "CC-MAIN-2017-34",
    "CC-MAIN-2017-39",
    "CC-MAIN-2017-43",
    "CC-MAIN-2017-47",
    "CC-MAIN-2017-51",
    #
    "CC-MAIN-2018-05",
    "CC-MAIN-2018-09",
    "CC-MAIN-2018-13",
    "CC-MAIN-2018-17",
    "CC-MAIN-2018-22",
    "CC-MAIN-2018-26",
    "CC-MAIN-2018-30",
    "CC-MAIN-2018-34",
    "CC-MAIN-2018-39",
    "CC-MAIN-2018-43",
    "CC-MAIN-2018-47",
    "CC-MAIN-2018-51",
    #
    "CC-MAIN-2019-04",
    "CC-MAIN-2019-09",
    "CC-MAIN-2019-13",
    "CC-MAIN-2019-18",
    "CC-MAIN-2019-22",
    "CC-MAIN-2019-26",
    "CC-MAIN-2019-30",
    "CC-MAIN-2019-35",
    "CC-MAIN-2019-39",
    "CC-MAIN-2019-43",
    "CC-MAIN-2019-47",
    "CC-MAIN-2019-51",
    #
    "CC-MAIN-2020-05",
    "CC-MAIN-2020-10",
    "CC-MAIN-2020-16",
    "CC-MAIN-2020-24",
    "CC-MAIN-2020-29",
    "CC-MAIN-2020-34",
    "CC-MAIN-2020-40",
    "CC-MAIN-2020-45",
    "CC-MAIN-2020-50",
    #
    "CC-MAIN-2021-04",
    "CC-MAIN-2021-10",
    "CC-MAIN-2021-17",
    "CC-MAIN-2021-21",
    "CC-MAIN-2021-25",
    "CC-MAIN-2021-31",
    "CC-MAIN-2021-39",
    "CC-MAIN-2021-43",
    "CC-MAIN-2021-49",
    #
]


s3_location = sys.argv[1]
s3_location = s3_location.rstrip("/")  # no trailing slash!

url_table = sys.argv[2]

crawl_selector = re.compile(sys.argv[3], re.IGNORECASE)


crawls = filter(lambda c: crawl_selector.match(c), crawls)


cursor = connect(
    s3_staging_dir="{}/staging".format(s3_location), region_name="us-east-1"
).cursor()

for crawl in crawls:
    query = join_template.format(
        crawl=crawl,
        s3_location=f"{s3_location}/cc-{url_table}",
        db="bigscience",
        url_table=url_table,
        tid="bs",
    )
    logging.info("Athena query: %s", query)

    cursor.execute(query)
    logging.info("Athena query ID %s: %s", cursor.query_id, cursor.result_set.state)
    logging.info(
        "  data_scanned_in_bytes: %d", cursor.result_set.data_scanned_in_bytes
    )
    logging.info(
        "  total_execution_time_in_millis: %d",
        cursor.result_set.total_execution_time_in_millis,
    )

    cursor.execute(drop_tmp_table.format(db="bigscience"))
    logging.info("Drop temporary table: %s", cursor.result_set.state)

cc_pseudo_crawl/cc_lookup.py renamed to cc_pseudo_crawl/cc_lookup_seed.py (+2 -6)

@@ -20,11 +20,7 @@
      format = 'PARQUET',
      parquet_compression = 'GZIP')
  AS SELECT
-        {tid}.id AS id,
-        {tid}.title AS title,
-        {tid}.link AS link,
-        {tid}.language AS language,
-        {tid}.url_surtkey AS url_surtkey_prefix,
+        {tid}.id AS seed_id,
         cc.url_surtkey AS url_surtkey,
         cc.url_host_tld AS url_host_tld,
         cc.url_host_registered_domain AS url_host_registered_domain,
@@ -168,7 +164,7 @@
  for crawl in crawls:
      query = join_template.format(
          crawl=crawl,
-         s3_location="{}/cc".format(s3_location),
+         s3_location=f"{s3_location}/cc-{seed_table}",
          db="bigscience",
          seed_table=seed_table,
          tid="bs",
(new file, +50)
import os
import logging
from argparse import ArgumentParser

from datasets import load_from_disk
from datasets.utils.logging import set_verbosity_info

set_verbosity_info()
logger = logging.getLogger(__name__)


def get_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset-dir", type=str, required=True, help="Directory containing the datasets."
    )

    args = parser.parse_args()
    return args


def main():
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    args = get_args()
    logger.info(
        f"** The job is run with the following arguments: **\n{args}\n **** "
    )

    for dataset_name in os.listdir(args.dataset_dir):
        dataset_path = os.path.join(args.dataset_dir, dataset_name)
        try:
            logging.info(f"Processing: {dataset_path}")
            ds = load_from_disk(dataset_path)
            new_ds = ds.filter(keep_failed_examples)
            logging.info(f"Here's the subset of failed downloads: {new_ds}")
        except Exception as e:
            logging.warning(f"Failed to process {dataset_path} with error '{str(e)}'")


def keep_failed_examples(example):
    # Keep only rows whose download raised an exception.
    if example["download_exception"] is None:
        return False
    return True


if __name__ == "__main__":
    main()
