Commit 930fe81

committed
very close
1 parent 38364d5 commit 930fe81

4 files changed: +234 -22 lines changed

examples/xglm/README.md

+1 -1

@@ -31,4 +31,4 @@ To save back to huggingface format use
 torchrun examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x56fM
 ```
 
-Make sure to have the [XGLM MOE implementation](https://github.com/negar-foroutan/Multilingual_MoE) installed (e.g. using `PYTHONPATH=/path/to/Multilingual_MoE/models`).
+Make sure to have the [XGLM MOE implementation](https://github.com/negar-foroutan/Multilingual_MoE) installed (e.g. using `PYTHONPATH=/path/to/Multilingual_MoE`).
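For reference, a minimal Python-side sketch of what that `PYTHONPATH` setting achieves (the clone path below is a placeholder, not part of the commit):

```python
import sys

# Hypothetical path to a local clone of Multilingual_MoE; adjust to your setup.
sys.path.insert(0, "/path/to/Multilingual_MoE")

# With the repository root on the path, the `models` package used by the
# conversion script resolves, e.g.:
from models.xglm_model import XGLMForCausalLM  # noqa: F401
```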

examples/xglm/convert_ntmoe2hf.py

+34 -14

@@ -9,22 +9,26 @@
 from pathlib import Path
 from typing import Optional
 
+import torch
 from transformers import AutoTokenizer
+from tqdm import tqdm
 
 from nanotron.config.models_config import GPT3MoEConfig
 from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
-from nanotron.models.moe import dMoE, SparseMLP
+from nanotron.models.moe import dMoE, SparseMLP, LearnedRouter
 
-from examples.xglm.convert_dense2moe import create_nt_moe_model, convert_attention
+from examples.xglm.convert_dense2moe import create_nt_moe_model
+from examples.xglm.convert_nt2hf import convert_attention
 from examples.xglm.convert_utils import convert_generic
 
 from models.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
+from models.gating import BasicGate
 
 # TODO: nanotron moe scales down the moe weights but hf doesn't
 # TODO: nanotron does not use pdrop in moe.
 
 
-def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig
+def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
     assert config.moe_num_experts > 1, f"Why are you using a 1-expert moe? lol"
     if config.embd_pdrop != config.resid_pdrop:
         warnings.warn(
@@ -59,7 +63,7 @@ def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig
         num_experts_per_tok=config.num_experts_per_tok,
         gate_type="linear",
         gate_depth=1,
-        router_aux_loss_coef=config.moe_looss_weight,
+        router_aux_loss_coef=config.moe_loss_weight,
     )
 
 
@@ -69,25 +73,38 @@ def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
     convert_generic(mlp_hf.fc2, mlp_nt.w2.module)
 
 
-def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
-    convert_generic(ff_hf.gate.gate, ff_nt.router.layer)
-    for expert_hf, expert_nt in zip(ff_hf.experts, ff_nt.experts):
-        convert_mlp(expert_hf, expert_nt.mlp)
+def convert_gate(gate_hf: BasicGate, gate_nt: LearnedRouter):
+    convert_generic(gate_hf.gate, gate_nt.layer)
+
 
+def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
+    convert_gate(ff_hf.gate, ff_nt.gate)
+    int_size = ff_nt.config.intermediate_size
+    for i, expert_hf in enumerate(ff_hf.experts):
+        # TODO: fc1, fc2 has bias
+        i0 = i*int_size
+        i1 = (i + 1)*int_size
+        with torch.no_grad():
+            expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight.T[i0:i1, :].clone())
+            expert_hf.fc1.bias.data.zero_()
+            expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[i0:i1, :].T.clone())
+            expert_hf.fc2.bias.data.zero_()
 
 def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
     convert_generic(block_hf.self_attn_layer_norm, block_nt.ln_1)
     convert_attention(block_hf.self_attn, block_nt.attn)
     convert_generic(block_hf.final_layer_norm, block_nt.ln_2)
     # TODO: hf has fc1, fc2 attributes but they are not used, probably should be removed.
-    convert_generic(block_hf.fc1, block_nt.ff.c_fc)
-    convert_generic(block_hf.fc2, block_nt.ff.c_proj)
+    #return block_nt.ff
+    convert_ff(block_hf.block_sparse_moe, block_nt.ff) # REMOVE
 
 
 def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
     convert_generic(model_hf.model.embed_tokens, model_nt.model.token_embeddings.pp_block.token_embedding)
-    for layer_hf, layer_nt in zip(model_hf.model.layers, model_nt.model.decoder):
-        convert_decoder(layer_hf, layer_nt.pp_block)
+    for layer_hf, layer_nt in tqdm(zip(model_hf.model.layers, model_nt.model.decoder), desc="Converting layers",
+                                   total=model_nt.config.num_hidden_layers):
+        #return convert_decoder(layer_hf, layer_nt.pp_block)
+        convert_decoder(layer_hf, layer_nt.pp_block) # REMOVE
     convert_generic(model_hf.model.layer_norm, model_nt.model.final_layer_norm.pp_block)
     convert_generic(model_hf.lm_head, model_nt.model.lm_head.pp_block)
 
@@ -104,7 +121,10 @@ def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
     if tokenizer_name is not None:
         tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
         tokenizer.save_pretrained(save_path)
-    convert(model_hf, model_nt)
+    states = torch.randn(4, 1, 1024)
+    #return convert(model_hf, model_nt), states.cuda().bfloat16()
+    convert(model_hf, model_nt), states.cuda().bfloat16() # REMOVE
+    print("Saving...")
     model_hf.save_pretrained(save_path)
     print(f"Model saved to {save_path}")
 
@@ -119,4 +139,4 @@ def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
     )
     parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
     args = parser.parse_args()
-    main(args.checkpoint_path, args.save_path, args.tokenizer_name)
+    ret = main(args.checkpoint_path, args.save_path, args.tokenizer_name)
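The new `convert_ff` relies on nanotron's `dMoE` keeping all experts' MLP weights fused along the intermediate dimension, so each HF expert receives one slice. A self-contained sketch of that slicing with plain tensors (the toy sizes and the fused layouts are assumptions read off the copies above, not the real modules):

```python
import torch

# Toy sizes standing in for the values from the nanotron config.
hidden, inter, n_experts = 8, 16, 4

# Fused layouts implied by the copies in convert_ff:
#   w1: [hidden, n_experts * inter], w2: [n_experts * inter, hidden]
w1 = torch.randn(hidden, n_experts * inter)
w2 = torch.randn(n_experts * inter, hidden)

for i in range(n_experts):
    i0, i1 = i * inter, (i + 1) * inter
    fc1_weight = w1.T[i0:i1, :]  # [inter, hidden], the shape of nn.Linear(hidden, inter).weight
    fc2_weight = w2[i0:i1, :].T  # [hidden, inter], the shape of nn.Linear(inter, hidden).weight
    assert fc1_weight.shape == (inter, hidden)
    assert fc2_weight.shape == (hidden, inter)
```

The HF biases are zeroed in `convert_ff`, presumably because the nanotron experts carry no equivalent bias, which is also what the `# TODO: fc1, fc2 has bias` note points at.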

examples/xglm/tests/test_moe.py

+179

@@ -0,0 +1,179 @@
+import torch
+import pytest
+
+import nanotron
+from nanotron.config.parallelism_config import ParallelismArgs
+from nanotron.config.models_config import GPT3MoEConfig
+from nanotron.parallel import ParallelContext
+from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
+from nanotron.trainer import mark_tied_parameters
+from nanotron.models.gpt3_moe import GPT3MoEBlock, GPT3MoEForTraining
+from nanotron.models.moe import LearnedRouter, dMoE
+
+from tests.helpers.utils import init_distributed
+
+from examples.xglm.convert_ntmoe2hf import convert_config, convert_gate, convert_ff, convert
+from examples.xglm.tests.test_implementation import almost_close
+
+from models.xglm_model import XGLMSparseMoeBlock, XGLMForCausalLM
+from models.gating import BasicGate
+
+
+MAX_SEQUENCE_LENGTH = 2048
+TEST_SEQUENCE_LENGTH = 128 # If we test with a very large sequence length, precision errors get more significant independent of the correct implementation.
+#TEST_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
+BATCH_SIZE = 4
+HIDDEN_SIZE = 1024
+DTYPE = torch.bfloat16
+#DTYPE = torch.float32
+TEXT = "Hello. This is a relatively long text. I will use this text to test the conversion scripts. Let's finish this text soon because I don't have much more to say. Final note:"
+
+CONFIG = GPT3MoEConfig(
+    attn_pdrop=0.0,
+    embd_pdrop=0.0,
+    resid_pdrop=0.0,
+    act_pdrop=0.0,
+    eos_token_id=2,
+    hidden_size=HIDDEN_SIZE,
+    intermediate_size=4096,
+    layer_norm_epsilon=1e-05,
+    max_position_embeddings=MAX_SEQUENCE_LENGTH,
+    num_attention_heads=16,
+    num_hidden_layers=24,
+    scale_attn_weights=True,
+    vocab_size=256008,
+    sinusoidal_position_embedding=True,
+    position_embedding_offset=2,
+    use_spda=DTYPE is not torch.bfloat16,
+    # vvv moe vvv
+    is_moe=True,
+    moe_num_experts=4,
+    num_experts_per_tok=4,
+    moe_loss_weight=0.01,
+    moe_z_loss_weight=0.0,
+    moe_glu=False,
+)
+#PARALLEL_CONFIG = ParallelismArgs(dp=1, pp=1, tp=1, expert_parallel_size=1) #CONFIG.moe_num_experts)
+
+
+@pytest.fixture
+def hidden_states() -> torch.Tensor:
+    return torch.randn(TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE)
+
+
+@pytest.fixture
+def input_mask() -> torch.Tensor:
+    return torch.ones(BATCH_SIZE, TEST_SEQUENCE_LENGTH, dtype=torch.bool)
+
+
+@pytest.fixture
+def input_ids() -> torch.Tensor:
+    return torch.randint(0, CONFIG.vocab_size, (BATCH_SIZE, TEST_SEQUENCE_LENGTH))
+
+
+def _test_nt2hf_gate(parallel_context: ParallelContext, hidden_states: torch.Tensor):
+    hidden_states = hidden_states.cuda()
+
+    config_hf = convert_config(CONFIG)
+    gate_nt = LearnedRouter(CONFIG).cuda().to(DTYPE)
+    gate_hf = BasicGate(config_hf).cuda().to(DTYPE)
+    convert_gate(gate_hf, gate_nt)
+
+    router_logits_nt, _, _ = gate_nt(hidden_states.view(-1, HIDDEN_SIZE))
+    router_logits_hf = gate_hf(hidden_states.permute(1, 0, 2).reshape(-1, HIDDEN_SIZE), "")
+
+    router_logits_nt = router_logits_nt.view(TEST_SEQUENCE_LENGTH, BATCH_SIZE, -1)
+    router_logits_hf = router_logits_hf.view(BATCH_SIZE, TEST_SEQUENCE_LENGTH, -1).permute(1, 0, 2)
+
+    assert router_logits_nt.size() == router_logits_hf.size()
+    torch.testing.assert_close(router_logits_nt, router_logits_hf)
+
+
+def test_nt2hf_gate(hidden_states: torch.Tensor):
+    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_gate)(hidden_states=hidden_states)
+
+
+def _test_nt2hf_ff(parallel_context: ParallelContext, hidden_states: torch.Tensor):
+    hidden_states = hidden_states.cuda()
+
+    config_hf = convert_config(CONFIG)
+    ff_nt = dMoE(CONFIG, parallel_context, None).cuda().to(DTYPE)
+    ff_hf = XGLMSparseMoeBlock(config_hf).cuda().to(DTYPE)
+    convert_ff(ff_hf, ff_nt)
+
+    out_nt = ff_nt(hidden_states)["hidden_states"]
+    out_hf, _ = ff_hf(hidden_states.permute(1, 0, 2).contiguous(), "")
+    out_hf = out_hf.permute(1, 0, 2)
+
+    assert out_nt.size() == out_hf.size()
+    almost_close(out_nt, out_hf, max_far=0.1, far_atol=0.02)
+    #torch.testing.assert_close(out_nt, out_hf)
+
+
+
+def _test_nt2hf_model(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
+    random_states = nanotron.random.RandomStates({"tp_synced": nanotron.random.get_current_random_state()})
+    input_ids = input_ids.cuda()
+    input_mask = input_mask.cuda()
+
+    # unfortunately, we can't use float64 with huggingface xglm.
+    new_dtype = torch.float32 if DTYPE == torch.float64 else DTYPE
+
+    # Get nanotron model.
+    config_nt = GPT3MoEConfig(**vars(CONFIG))
+    if new_dtype not in {torch.bfloat16, torch.float16}:
+        config_nt.use_spda = True
+    model_nt = nanotron.models.build_model(
+        model_builder=lambda: GPT3MoEForTraining(
+            config=config_nt,
+            parallel_context=parallel_context,
+            parallel_config=None,
+            random_states=random_states,
+        ),
+        parallel_context=parallel_context,
+        dtype=new_dtype,
+        device="cuda",
+    ).eval()
+    mark_tied_parameters(model=model_nt, parallel_context=parallel_context)
+
+    # Create empty model_hf and make conversion.
+    model_hf = XGLMForCausalLM(convert_config(config_nt)).cuda().to(new_dtype).eval()
+    convert(model_hf, model_nt)
+
+    # Needed :/
+    aux_losses = {
+        "load_balancing_loss": (
+            torch.zeros(1, device=input_ids.device)
+            if not isinstance(input_ids, TensorPointer)
+            else TensorPointer(self.input_pp_rank)
+        ),
+        "z_loss": (
+            torch.zeros(1, device=input_ids.device)
+            if not isinstance(input_ids, TensorPointer)
+            else TensorPointer(self.input_pp_rank)
+        ),
+    }
+
+    # Get outputs and assert.
+    with torch.no_grad():
+        out_nt = model_nt.model(input_ids, input_mask, aux_losses)["sharded_logits"].to(new_dtype)
+        del model_nt
+        torch.cuda.empty_cache()
+        out_hf = model_hf(input_ids=input_ids, attention_mask=input_mask, output_router_logits=False).logits.permute(1, 0, 2)
+        del model_hf
+        torch.cuda.empty_cache()
+    assert out_nt.size() == out_hf.size(), f"{out_nt.size()}, {out_hf.size()}"
+    return out_nt.cpu(), out_hf.cpu()
+
+
+def test_nt2hf_ff(hidden_states: torch.Tensor):
+    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_ff)(hidden_states=hidden_states)
+
+
+def _test_nt2hf_dummy_xglm(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
+    out_nt, out_hf = _test_nt2hf_model(parallel_context, input_ids, input_mask)
+    almost_close(out_nt, out_hf, max_far=0.1, far_atol=0.02)
+
+
+def test_nt2hf_dummy_xglm(input_ids: torch.Tensor, input_mask: torch.Tensor):
+    init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_dummy_xglm)(input_ids=input_ids, input_mask=input_mask)
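The permutes and reshapes in these tests bridge two activation layouts: the nanotron modules consume sequence-first tensors `[seq, batch, hidden]` (see the `hidden_states` fixture), while the HF XGLM modules expect batch-first `[batch, seq, hidden]`. A minimal sketch of that round trip, independent of any model code:

```python
import torch

seq, batch, hidden = 128, 4, 1024

x_nt = torch.randn(seq, batch, hidden)     # nanotron layout: sequence-first
x_hf = x_nt.permute(1, 0, 2).contiguous()  # HF layout: batch-first

# Both layouts flatten to the same set of [hidden]-sized token vectors
# (in a different order), which is what the gate test feeds to the routers.
flat_nt = x_nt.reshape(-1, hidden)
flat_hf = x_hf.reshape(-1, hidden)
assert flat_nt.shape == flat_hf.shape == (seq * batch, hidden)

# HF outputs are mapped back to the nanotron layout before comparison.
back = flat_hf.view(batch, seq, hidden).permute(1, 0, 2)
torch.testing.assert_close(back, x_nt)
```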

src/nanotron/models/moe.py

+20 -7

@@ -162,7 +162,8 @@ def forward(self, hidden_states: torch.Tensor):
         router_logits, expert_weights, top_experts = self.gate(x)
 
         # Compute the experts.
-        x, lbl_loss, z_loss = self.experts(x, router_logits, expert_weights, top_experts)
+        #return self.experts(x, router_logits, expert_weights, top_experts)
+        x, lbl_loss, z_loss = self.experts(x, router_logits, expert_weights, top_experts) #REMOVE
         return {
             "hidden_states": x.reshape(batch_size, sequence_length, -1),
             "load_balancing_loss": lbl_loss,
@@ -300,12 +301,15 @@ def forward_once(self, x, expert_weights, top_experts): # TODO: sparse
         ) = self.indices_and_padded_bins(top_experts)
 
         # Route the tokens for MoE computation.
+        #x_pre = x.clone()
         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, self.num_experts_per_tok)
+        #print("forward_once a", x.shape)
 
         with torch.no_grad():
            topo = self.topology(x, padded_bins)
 
-        x = self.mlp(x, topo)
+        x = self.mlp(x, topo) #REMOVE
+        #return x_pre, self.mlp(x, topo)
 
         # Un-route the data for the MoE output.
         x = ops.padded_scatter(
@@ -422,7 +426,11 @@ def forward(self, x, router_logits, expert_weights, top_experts):
         top_experts: tensor of shape [sequence_length * batch_size, num_experts_per_tok]
         """
         # Compute the experts.
-        x, tokens_per_expert = self.forward_fn(x, expert_weights.flatten(), top_experts.flatten())
+        x, tokens_per_expert = self.forward_fn(x, expert_weights.flatten(), top_experts.flatten()) #REMOVE
+        #return router_logits
+        #print("nano b", expert_weights)
+        #return expert_weights.bfloat16()
+        #return self.forward_fn(x, expert_weights.flatten(), top_experts.flatten())
        if self.training:
             lbl_loss = load_balancing_loss(router_logits, tokens_per_expert, self.config)
             z_loss = router_z_loss(router_logits, self.config)
@@ -595,9 +603,14 @@ def __init__(
 
     def forward(self, x, topo):
         self.w1.scale_gradients(), self.w2.scale_gradients()
-        x = self.sdd(x.contiguous(), self.w1.module.weight, topo)
-        activation_fn_out = act_fn(x, self.act)
-        return self.dsd(activation_fn_out, self.w2.module.weight)
+        x = self.sdd(x.contiguous(), self.w1.module.weight, topo) # REMOVE
+        #x1 = self.sdd(x.contiguous(), self.w1.module.weight, topo)
+        activation_fn_out = act_fn(x, self.act) # REMOVE
+        #print(x.shape, activation_fn_out.shape, self.w2.module.weight.shape)
+        #activation_fn_out = act_fn(x1, self.act)
+        return self.dsd(activation_fn_out, self.w2.module.weight) #REMOVE
+        #x2 = self.dsd(activation_fn_out, self.w2.module.weight)
+        #return x, x1, x2, topo, self.w1.module.weight, self.w2.module.weight
 
 
 class MLP(nn.Module):
@@ -718,4 +731,4 @@ def forward(self, x, topo):
         x1 = self.sdd(x, self.w1.module.weight, topo)
         x2 = self.sdd(x, self.w3.module.weight, topo)
         x = stk.ops.mul(act_fn(x1, self.act), x2)
-        return self.dsd(x, self.w2.module.weight)
+        return self.dsd(x, self.w2.module.weight)
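For orientation, the `sdd` → `act_fn` → `dsd` pipeline being instrumented here computes, for each routed token, a plain two-layer MLP using the slice of the fused `w1`/`w2` weights that belongs to the token's expert. A rough dense equivalent, leaving out the sparse kernels, padding, and gather/scatter (the names, shapes, and the GELU activation are illustrative assumptions, not the real `SparseMLP` API):

```python
import torch
import torch.nn.functional as F

hidden, inter, n_experts, tokens = 8, 16, 4, 10

w1 = torch.randn(hidden, n_experts * inter)  # fused first-layer weights
w2 = torch.randn(n_experts * inter, hidden)  # fused second-layer weights
x = torch.randn(tokens, hidden)              # token activations after routing
expert_of = torch.randint(0, n_experts, (tokens,))  # expert assigned to each token

out = torch.empty_like(x)
for t in range(tokens):
    e = int(expert_of[t])
    i0, i1 = e * inter, (e + 1) * inter
    h = F.gelu(x[t] @ w1[:, i0:i1])  # first layer + activation (activation assumed)
    out[t] = h @ w2[i0:i1, :]        # second layer back to the hidden size
```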
