Skip to content

Commit

Permalink
Actually do sequence packing (#873)
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmeda14960 authored Feb 10, 2025
1 parent 1927f93 commit 7a78c13
Show file tree
Hide file tree
Showing 12 changed files with 769 additions and 52 deletions.
200 changes: 200 additions & 0 deletions config/data/dolma_llama.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
cache_dir: null
cache_options:
batch_size: 128
num_shard_groups: 128
target_size_per_flush: 512MB
configs:
dolma/algebraic-stack:
cache_dir: gs://marin-us-west4/tokenized/dolma/algebraic-stack-cc00cf
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/algebraic-stack-train-{0000..0015}.json.gz
validation_urls: []
dolma/arxiv:
cache_dir: gs://marin-us-west4/tokenized/dolma/arxiv-07a51f
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/arxiv-{0000..0099}.json.gz
validation_urls: []
dolma/c4:
cache_dir: gs://marin-us-west4/tokenized/dolma/c4-e0e5ec
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/c4-{0000..0170}.json.gz
validation_urls: []
dolma/cc:
cache_dir: gs://marin-us-west4/tokenized/dolma/cc-74b017
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_head-{0000..0274}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0000..0238}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0240..0379}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0000..0152}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0154..0444}.json.gz
validation_urls: []
dolma/cc-news:
cache_dir: gs://marin-us-west4/tokenized/dolma/cc-news-625d3e
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_head-{0000..0004}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_middle-{0000..0002}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_tail-0000.json.gz
validation_urls: []
dolma/falcon:
cache_dir: gs://marin-us-west4/tokenized/dolma/falcon-da8fd0
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/falcon-{0000..0499}.json.gz
validation_urls: []
dolma/flan:
cache_dir: gs://marin-us-west4/tokenized/dolma/flan-a99cb2
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/tulu_flan-{0000..0065}.json.gz
validation_urls: []
dolma/gutenberg:
cache_dir: gs://marin-us-west4/tokenized/dolma/gutenberg-f9eb99
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/books-{0000..0002}.json.gz
validation_urls: []
dolma/megawika:
cache_dir: gs://marin-us-west4/tokenized/dolma/megawika-34abf2
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/megawika-{0000..0261}.json.gz
validation_urls: []
dolma/open-web-math:
cache_dir: gs://marin-us-west4/tokenized/dolma/open-web-math-79823d
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/open-web-math-train-{0000..0012}.json.gz
validation_urls: []
dolma/pes2o:
cache_dir: gs://marin-us-west4/tokenized/dolma/pes2o-538363
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/pes2o-{0000..0025}.json.gz
validation_urls: []
dolma/reddit:
cache_dir: gs://marin-us-west4/tokenized/dolma/reddit-62a64a
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/reddit-{0000..0077}.json.gz
validation_urls: []
dolma/stackexchange:
cache_dir: gs://marin-us-west4/tokenized/dolma/stackexchange-adfc49
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/stackexchange-{0000..0025}.json.gz
validation_urls: []
dolma/starcoder:
cache_dir: gs://marin-us-west4/tokenized/dolma/starcoder-8b6089
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/starcoder-{0000..0048}.json.gz
validation_urls: []
dolma/wiki:
cache_dir: gs://marin-us-west4/tokenized/dolma/wiki-212315
id: null
name: null
plaintext: false
stream: true
tags: []
text_key: text
train_urls:
- gs://marin-us-central2/raw/dolma/v1.7/wiki-{0000..0001}.json.gz
validation_urls: []
enforce_eos: true
ignore_token_id: null
mixture_block_size: 2048
shuffle: true
stop_strategy: restart
tokenizer: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
train_weights:
dolma/algebraic-stack: 12.6
dolma/arxiv: 28.0
dolma/c4: 124.95
dolma/cc: 597.75
dolma/cc-news: 14.3
dolma/falcon: 456.4
dolma/flan: 16.5
dolma/gutenberg: 5.3
dolma/megawika: 4.6
dolma/open-web-math: 12.6
dolma/pes2o: 57.2
dolma/reddit: 79.9
dolma/stackexchange: 19.6
dolma/starcoder: 263.8
dolma/wiki: 7.4
vocab_size: null
Loading

0 comments on commit 7a78c13

Please sign in to comment.