- import lit_llama.packed_dataset as packed_dataset
- from lit_llama import Tokenizer, HFTokenizer
- from datasets import load_dataset
- import numpy as np
-
+ import os
+ import time
from pathlib import Path
+
+ from litgpt import HFTokenizer
+ from litgpt.data.prepare_starcoder import DataChunkRecipe
+ from litgpt.utils import CLI
+
+ from datasets.load import load_dataset
+
import sys

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

- sample_ids = ["izumi-lab/wikinews-ja-20230728", "izumi-lab/wikinews-en-20230728", "if001/aozorabunko-clean-sin"]
+ dataset_list = [
+     {"id": "wikimedia/wikipedia", "config": "20231101.en"},
+     {"id": "wikimedia/wikipedia", "config": "20231101.ja"},
+     {"id": "CohereForAI/aya_dataset", "config": "en"},
+     {"id": "CohereForAI/aya_dataset", "config": "ja"},
+ ]


def format_number(num):
@@ -24,54 +33,66 @@ def format_number(num):
    return str(num)


+ class YuisekinAIDataRecipe(DataChunkRecipe):
+     def __init__(self, tokenizer: HFTokenizer, chunk_size: int):
+         super().__init__(chunk_size)
+         self.tokenizer = tokenizer
+         self.total_token_cnt = 0
+
+     def prepare_item(self):
+         for dataset_data in dataset_list:
+             print("start...", dataset_data["id"], dataset_data["config"])
+             dataset_id = dataset_data["id"]
+             dataset_config = dataset_data["config"]
+             if dataset_config is not None:
+                 dataset = load_dataset(dataset_id, dataset_config)
+             else:
+                 dataset = load_dataset(dataset_id)
+             ds = dataset["train"]
+             print("ds", ds)
+             if "aya" in dataset_id:
+                 for v in ds["inputs"]:
+                     text_ids = self.tokenizer.encode(v, bos=False, eos=True)
+                     self.total_token_cnt += len(text_ids)
+                     yield text_ids
+             else:
+                 for v in ds:
+                     text_ids = self.tokenizer.encode(v["text"], bos=False, eos=True)
+                     self.total_token_cnt += len(text_ids)
+                     yield text_ids
+
+
def prepare_for_dataset(
-     dataset_ids: list[str],
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
) -> None:
    destination_path.mkdir(parents=True, exist_ok=True)
-     # tokenizer = Tokenizer(tokenizer_path)
-     tokenizer = HFTokenizer(model_path=tokenizer_path)
-     total_token_cnt = 0
-     for dataset_id in dataset_ids:
-         token_cnt = 0
-         print(f"Processing {dataset_ids}")
-         prefix = dataset_id.split("/")[-1]
-         builder = packed_dataset.PackedDatasetBuilder(
-             outdir=destination_path,
-             prefix=prefix,
-             chunk_size=chunk_size,
-             sep_token=tokenizer.bos_id,
-             dtype="auto",
-             vocab_size=tokenizer.vocab_size,
-         )
-         ds = load_dataset(dataset_id)
-         ds = ds["train"]
-
-         if "aozora" in dataset_id:
-             for v in ds["text"]:
-                 text_ids = tokenizer.encode(v)
-                 token_cnt += len(text_ids)
-                 builder.add_array(np.array(text_ids, dtype=builder.dtype))
-         else:
-             for v in ds:
-                 text_ids = tokenizer.encode(v["text"])
-                 token_cnt += len(text_ids)
-                 builder.add_array(np.array(text_ids, dtype=builder.dtype))
-         builder.write_reminder()
-         print("tokens ", format_number(token_cnt))
-         total_token_cnt += token_cnt
-     print("total tokens", format_number(total_token_cnt))
+     from litdata.processing.data_processor import DataProcessor
+
+     tokenizer = HFTokenizer(tokenizer_path)
+     data_recipe = YuisekinAIDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
+     data_processor = DataProcessor(
+         input_dir=None,
+         output_dir=str(destination_path),
+         fast_dev_run=True,
+         num_workers=os.cpu_count(),
+         num_downloaders=1,
+     )
+
+     start_time = time.time()
+     data_processor.run(data_recipe)
+     elapsed_time = time.time() - start_time
+     print(f"Time taken: {elapsed_time:.2f} seconds")


def prepare(
    destination_path: Path = Path("/data/YuisekinAI_data"),
    # 2048 block size + 1 for causal (from LLama), 1024 blocks
    chunk_size: int = 2049 * 1024,
) -> None:
+     tokenizer_path = Path("./tmp/tokenizer.json")
    prepare_for_dataset(
-         dataset_ids=dataset_ids,
        tokenizer_path=tokenizer_path,
        destination_path=destination_path,
        chunk_size=chunk_size,
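
The change above swaps lit_llama's PackedDatasetBuilder for litdata's DataProcessor, so destination_path ends up holding litdata chunk files rather than packed .bin shards. As a rough orientation, this is a minimal sketch of how such chunks are typically read back for training, assuming litdata's top-level StreamingDataset / StreamingDataLoader API and an illustrative path; this code is not part of the commit:

# Sketch only (not part of this commit): stream the chunks written by
# data_processor.run(data_recipe). The directory is the same destination_path
# passed to prepare(); the path and batch settings are illustrative.
from litdata import StreamingDataset, StreamingDataLoader

dataset = StreamingDataset(input_dir="/data/YuisekinAI_data")
loader = StreamingDataLoader(dataset, batch_size=4, num_workers=2)

for tokens in loader:
    ...  # each item corresponds to a tokenized sequence yielded by prepare_item()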