NVIDIA · megha95 · Feb 16, 2024
diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py
@@ -1003,13 +1003,8 @@ def convert_hf_llama(hf_model,
 
             moe_experts_gate_weights = get_weight(
                 model_params, prefix + 'block_sparse_moe.gate', dtype)
-            v = split(moe_experts_gate_weights,
-                      mapping.tp_size,
-                      mapping.tp_rank,
-                      dim=-1)
-
             weights.update(
-                get_tllm_linear_weight(v.to(torch.float32),
+                get_tllm_linear_weight(moe_experts_gate_weights.to(torch.float32),
                                        tllm_prex + 'mlp.router.', None,
                                        use_weight_only,
                                        plugin_weight_only_quant_type, dtype,

diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py
@@ -230,17 +230,16 @@ def __init__(self,
         if quant_mode.is_weight_only():
             self.weight_dtype = trt.int8
 
-        # TODO: benchmark the router and check best TP configuration
-        # Since output dimension is usually low (in the order of 10s), we split on input dim for the moment
-        # Maybe no TP at all is even more efficient
+        # The router's output dimension is usually small (on the order of tens), so running it
+        # without tensor parallelism is more efficient: no all-reduce is needed afterwards.
         self.router = RowLinear(
             hidden_size,
             self.num_experts,
             bias=False,
             dtype=trt.
             float32,  # Routing is sensitive since it conditions what experts are used
-            tp_group=tp_group,
-            tp_size=tp_size,
+            tp_group=None,
+            tp_size=1,
             strict_dtype=True,
         )
 
@@ -293,10 +292,6 @@ def __init__(self,
     def forward(self, hidden_states, finished=None, lora_layer_params=None):
         assert lora_layer_params is None, "LoRA + MoE is not supported for the moment"
         routing_input = cast(hidden_states, trt.float32)
-        if self.tp_size > 1:
-            routing_input = split(routing_input,
-                                  self.router.in_features,
-                                  dim=-1)[self.tp_rank]
         routing = self.router(routing_input)
         output = _moe_plugin(self.moe_config,
                              hidden_states,