@@ -262,4 +262,97 @@ def hidden_act(self):
         return self.activation_function


-NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig
+@dataclass
+class GPT3LangMoEConfig:
+    """Configuration for a GPT3 MoE model with language aware gating"""
+
+    activation_function: str = "gelu"
+    attn_pdrop: float = 0.1
+    embd_pdrop: float = 0.1
+    eos_token_id: int = 49152
+    hidden_size: int = 2048
+    intermediate_size: Optional[int] = None
+    layer_norm_epsilon: float = 1e-05
+    max_position_embeddings: int = 4096
+    num_attention_heads: int = 16
+    num_hidden_layers: int = 24
+    resid_pdrop: float = 0.1
+    scale_attention_softmax_in_fp32: bool = True
+    scale_attn_weights: bool = True
+    vocab_size: int = 49280
+    sinusoidal_position_embedding: bool = True
+    position_embedding_offset: int = 2
+    use_spda: bool = False
+    act_pdrop: float = 0.0
+    scale_embedding: bool = True
+    # MoE specific
+    is_moe: bool = True
+    moe_num_experts: int = 1
+    num_experts_per_tok: int = 1
+    moe_loss_weight: float = 0.01
+    moe_z_loss_weight: float = 0.001
+    moe_glu: bool = False
+
+    # Language aware gating
+    num_languages: int = 100
+    language_embedding_size: int = 128
+
+    def as_gpt3(self) -> GPT3Config:
+        config = dict(**vars(self))
+
+        # MoE
+        del config["is_moe"]
+        del config["moe_num_experts"]
+        del config["num_experts_per_tok"]
+        del config["moe_loss_weight"]
+        del config["moe_z_loss_weight"]
+        del config["moe_glu"]
+
+        # Language aware gating
+        del config["num_languages"]
+        del config["language_embedding_size"]
+
+        if "_is_using_mup" in config:
+            del config["_is_using_mup"]
+        return GPT3Config(**config)
+
+    def as_starcoder2(self) -> Starcoder2Config:
+        # Same as the GPT3 conversion above, but also drop GPT3-only fields
+        config = dict(**vars(self))
+        del config["sinusoidal_position_embedding"]
+        del config["use_spda"]
+        del config["position_embedding_offset"]
+        del config["act_pdrop"]
+        del config["scale_embedding"]
+
+        # MoE
+        del config["is_moe"]
+        del config["moe_num_experts"]
+        del config["num_experts_per_tok"]
+        del config["moe_loss_weight"]
+        del config["moe_z_loss_weight"]
+        del config["moe_glu"]
+
+        # Language aware gating
+        del config["num_languages"]
+        del config["language_embedding_size"]
+
+        if "_is_using_mup" in config:
+            del config["_is_using_mup"]
+        return Starcoder2Config(
+            grouped_query=True,
+            num_kv_heads=self.num_attention_heads,
+            use_rotary_embeddings=False,
+            **config,
+        )
+
+    @property
+    def n_inner(self):
+        return self.intermediate_size
+
+    @property
+    def hidden_act(self):
+        return self.activation_function
+
+
+NanotronConfigs = LlamaConfig | Starcoder2Config | GPT3Config | GPT3MoEConfig | GPT3LangMoEConfig
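
For context, a minimal usage sketch of the conversion helpers added in this diff. The concrete field values below are illustrative only, and it is assumed that `GPT3LangMoEConfig` is importable from the project's config module alongside `GPT3Config` and `Starcoder2Config`; adjust to the actual repository layout.

```python
# Hypothetical usage sketch: narrow a language-aware MoE config down to the
# dense GPT3 / Starcoder2 configs by dropping the MoE and language-gating
# fields, as as_gpt3() / as_starcoder2() do in the diff above.
# (Field values here are arbitrary examples, not defaults from the PR.)
config = GPT3LangMoEConfig(
    hidden_size=1024,
    num_hidden_layers=12,
    moe_num_experts=8,
    num_experts_per_tok=2,
    num_languages=50,
)

dense_gpt3 = config.as_gpt3()          # GPT3Config without MoE / language fields
dense_sc2 = config.as_starcoder2()     # Starcoder2Config without rotary embeddings
assert dense_gpt3.hidden_size == dense_sc2.hidden_size == 1024
```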