
Commit 8e3d195

committed
langmoe
1 parent f31a1a3 commit 8e3d195

File tree

4 files changed: +623 -8 lines changed


src/nanotron/config/models_config.py (+94 -1)
@@ -262,4 +262,97 @@ def hidden_act(self):
         return self.activation_function
 
 
-NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig
+@dataclass
+class GPT3LangMoEConfig:
+    """Configuration for a GPT3 __MoE__ model with language aware gating"""
+
+    activation_function: str = "gelu"
+    attn_pdrop: float = 0.1
+    embd_pdrop: float = 0.1
+    eos_token_id: int = 49152
+    hidden_size: int = 2048
+    intermediate_size: Optional[int] = None
+    layer_norm_epsilon: float = 1e-05
+    max_position_embeddings: int = 4096
+    num_attention_heads: int = 16
+    num_hidden_layers: int = 24
+    resid_pdrop: float = 0.1
+    scale_attention_softmax_in_fp32: bool = True
+    scale_attn_weights: bool = True
+    vocab_size: int = 49280
+    sinusoidal_position_embedding: bool = True
+    position_embedding_offset: int = 2
+    use_spda: bool = False
+    act_pdrop: float = 0.0
+    scale_embedding: bool = True
+    # MoE specific
+    is_moe: bool = True
+    moe_num_experts: int = 1
+    num_experts_per_tok: int = 1
+    moe_loss_weight: float = 0.01
+    moe_z_loss_weight: float = 0.001
+    moe_glu: bool = False
+
+    # Language aware gating
+    num_languages: int = 100
+    language_embedding_size: int = 128
+
+    def as_gpt3(self) -> GPT3Config:
+        config = dict(**vars(self))
+
+        # Moe
+        del config["is_moe"]
+        del config["moe_num_experts"]
+        del config["num_experts_per_tok"]
+        del config["moe_loss_weight"]
+        del config["moe_z_loss_weight"]
+        del config["moe_glu"]
+
+        # language aware gating
+        del config["num_languages"]
+        del config["language_embedding_size"]
+
+        if "_is_using_mup" in config:
+            del config["_is_using_mup"]
+        return GPT3Config(**config)
+
+    def as_starcoder2(self) -> Starcoder2Config:
+        # same as gpt3 conversion above
+        config = dict(**vars(self))
+        del config["sinusoidal_position_embedding"]
+        del config["use_spda"]
+        del config["position_embedding_offset"]
+        del config["act_pdrop"]
+        del config["scale_embedding"]
+
+        # Moe
+        del config["is_moe"]
+        del config["moe_num_experts"]
+        del config["num_experts_per_tok"]
+        del config["moe_loss_weight"]
+        del config["moe_z_loss_weight"]
+        del config["moe_glu"]
+
+        # language aware gating
+        del config["num_languages"]
+        del config["language_embedding_size"]
+
+        if "_is_using_mup" in config:
+            del config["_is_using_mup"]
+        return Starcoder2Config(
+            grouped_query=True,
+            num_kv_heads=self.num_attention_heads,
+            use_rotary_embeddings=False,
+            **config,
+        )
+
+    @property
+    def n_inner(self):
+        return self.intermediate_size
+
+    @property
+    def hidden_act(self):
+        return self.activation_function
+
+
+NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig | GPT3LangMoEConfig
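
For context, a minimal usage sketch of the new config and its conversion helpers. The import path nanotron.config.models_config follows from this file's location, and the expert counts shown are illustrative assumptions, not values from this commit:

    # Hypothetical usage sketch; the MoE field values here are illustrative only.
    from nanotron.config.models_config import GPT3LangMoEConfig

    lang_moe = GPT3LangMoEConfig(
        moe_num_experts=8,       # assumed expert count (default is 1)
        num_experts_per_tok=2,   # assumed top-k routing (default is 1)
        num_languages=100,
        language_embedding_size=128,
    )

    # Dropping the MoE and language-gating fields recovers the plain configs.
    gpt3_config = lang_moe.as_gpt3()              # -> GPT3Config
    starcoder2_config = lang_moe.as_starcoder2()  # -> Starcoder2Config (grouped_query=True, no rotary embeddings)

The conversion methods mirror GPT3MoEConfig's pattern: copy the dataclass fields, delete everything the target config does not accept, then construct the target.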
