
Commit b6b8a06

impl src/dataset/prepare.py
1 parent b9dc33a commit b6b8a06

File tree

1 file changed: +61 −40 lines changed


src/dataset/prepare.py

+61-40
@@ -1,16 +1,25 @@
-import lit_llama.packed_dataset as packed_dataset
-from lit_llama import Tokenizer, HFTokenizer
-from datasets import load_dataset
-import numpy as np
-
+import os
+import time
 from pathlib import Path
+
+from litgpt import HFTokenizer
+from litgpt.data.prepare_starcoder import DataChunkRecipe
+from litgpt.utils import CLI
+
+from datasets.load import load_dataset
+
 import sys
 
 # support running without installing as a package
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 
-sample_ids = ["izumi-lab/wikinews-ja-20230728", "izumi-lab/wikinews-en-20230728", "if001/aozorabunko-clean-sin"]
+dataset_list = [
+    {"id": "wikimedia/wikipedia", "config": "20231101.en"},
+    {"id": "wikimedia/wikipedia", "config": "20231101.ja"},
+    {"id": "CohereForAI/aya_dataset", "config": "en"},
+    {"id": "CohereForAI/aya_dataset", "config": "ja"},
+]
 
 
 def format_number(num):
@@ -24,54 +33,66 @@ def format_number(num):
     return str(num)
 
 
+class YuisekinAIDataRecipe(DataChunkRecipe):
+    def __init__(self, tokenizer: HFTokenizer, chunk_size: int):
+        super().__init__(chunk_size)
+        self.tokenizer = tokenizer
+        self.total_token_cnt = 0
+
+    def prepare_item(self):
+        for dataset_data in dataset_list:
+            print("start...", dataset_data["id"], dataset_data["config"])
+            dataset_id = dataset_data["id"]
+            dataset_config = dataset_data["config"]
+            if dataset_config is not None:
+                dataset = load_dataset(dataset_id, dataset_config)
+            else:
+                dataset = load_dataset(dataset_id)
+            ds = dataset["train"]
+            print("ds", ds)
+            if "aya" in dataset_id:
+                for v in ds["inputs"]:
+                    text_ids = self.tokenizer.encode(v, bos=False, eos=True)
+                    self.total_token_cnt += len(text_ids)
+                    yield text_ids
+            else:
+                for v in ds:
+                    text_ids = self.tokenizer.encode(v["text"], bos=False, eos=True)
+                    self.total_token_cnt += len(text_ids)
+                    yield text_ids
+
+
 def prepare_for_dataset(
-    dataset_ids: list[str],
     tokenizer_path: Path,
     destination_path: Path,
     chunk_size: int,
 ) -> None:
     destination_path.mkdir(parents=True, exist_ok=True)
-    # tokenizer = Tokenizer(tokenizer_path)
-    tokenizer = HFTokenizer(model_path=tokenizer_path)
-    total_token_cnt = 0
-    for dataset_id in dataset_ids:
-        token_cnt = 0
-        print(f"Processing {dataset_ids}")
-        prefix = dataset_id.split("/")[-1]
-        builder = packed_dataset.PackedDatasetBuilder(
-            outdir=destination_path,
-            prefix=prefix,
-            chunk_size=chunk_size,
-            sep_token=tokenizer.bos_id,
-            dtype="auto",
-            vocab_size=tokenizer.vocab_size,
-        )
-        ds = load_dataset(dataset_id)
-        ds = ds["train"]
-
-        if "aozora" in dataset_id:
-            for v in ds["text"]:
-                text_ids = tokenizer.encode(v)
-                token_cnt += len(text_ids)
-                builder.add_array(np.array(text_ids, dtype=builder.dtype))
-        else:
-            for v in ds:
-                text_ids = tokenizer.encode(v["text"])
-                token_cnt += len(text_ids)
-                builder.add_array(np.array(text_ids, dtype=builder.dtype))
-        builder.write_reminder()
-        print("tokens ", format_number(token_cnt))
-        total_token_cnt += token_cnt
-    print("total tokens", format_number(total_token_cnt))
+    from litdata.processing.data_processor import DataProcessor
+
+    tokenizer = HFTokenizer(tokenizer_path)
+    data_recipe = YuisekinAIDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
+    data_processor = DataProcessor(
+        input_dir=None,
+        output_dir=str(destination_path),
+        fast_dev_run=True,
+        num_workers=os.cpu_count(),
+        num_downloaders=1,
+    )
+
+    start_time = time.time()
+    data_processor.run(data_recipe)
+    elapsed_time = time.time() - start_time
+    print(f"Time taken: {elapsed_time:.2f} seconds")
 
 
 def prepare(
     destination_path: Path = Path("/data/YuisekinAI_data"),
     # 2048 block size + 1 for causal (from LLama), 1024 blocks
     chunk_size: int = 2049 * 1024,
 ) -> None:
+    tokenizer_path = Path("./tmp/tokenizer.json")
     prepare_for_dataset(
-        dataset_ids=dataset_ids,
         tokenizer_path=tokenizer_path,
         destination_path=destination_path,
         chunk_size=chunk_size,
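
Note: the new module imports CLI from litgpt.utils, but the hunks above do not show an entry point, so how prepare() gets launched is not visible in this commit. The sketch below is an assumption about the rest of the file, following the usual litgpt-style pattern of exposing the top-level function through the jsonargparse-based CLI helper; the __main__ block and the command line are hypothetical, not part of the diff.

if __name__ == "__main__":
    # Hypothetical entry point (not shown in this diff): parse command-line
    # arguments for prepare() via the CLI helper imported at the top of the file.
    CLI(prepare)

With that wiring, the script could be run from the repository root as, for example, python src/dataset/prepare.py --destination_path /data/YuisekinAI_data --chunk_size 2098176, where 2098176 is the default 2049 * 1024 (a 2048-token block plus one token for the causal shift, times 1024 blocks per chunk).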

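One detail the shown hunks leave open: the new recipe accumulates self.total_token_cnt and the commit keeps format_number, but nothing prints the total the way the removed implementation did. A minimal sketch of where such a report could go inside prepare_for_dataset, assuming a single-process run; with num_workers set to os.cpu_count(), the counter is incremented in litdata worker processes and the value seen in the parent may stay at zero (an assumption about litdata's process model, not something this diff confirms).

    # Hypothetical follow-up to the existing run call in prepare_for_dataset,
    # mirroring the token-count summary that the removed code printed.
    data_processor.run(data_recipe)
    print("total tokens", format_number(data_recipe.total_token_cnt))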