addresses review comment

sarathc-cerebras · sarathc-cerebras · commit a363e4540f72 · 2025-12-09T20:57:29.000+04:00
diff --git a/docs/source/en/model_doc/jais2.md b/docs/source/en/model_doc/jais2.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 
 ## Overview
 
-Jais2 is a large language model developed by MBZUAI, Inception and Cerebras Systems. It is based on the transformer architecture with several modifications including:
+Jais2 is a next-generation Arabic open-weight LLM trained on the richest Arabic-first dataset to date. Built from the ground up with 8B and 70B parameters, Jais2 understands Arabic the way it's truly spoken across dialects, culture, and modern expression. It is developed by MBZUAI, Inception and Cerebras Systems and is based on the transformer architecture with modifications including:
 
 - LayerNorm instead of RMSNorm
 - ReLU² activation function
diff --git a/src/transformers/models/jais2/modular_jais2.py b/src/transformers/models/jais2/modular_jais2.py
@@ -17,7 +17,6 @@
 
 import torch.nn as nn
 
-from ...activations import ACT2FN
 from ...modeling_rope_utils import RopeParameters
 from ...utils import auto_docstring, can_return_tuple
 from ..llama.configuration_llama import LlamaConfig
@@ -30,6 +29,7 @@
     LlamaModel,
     LlamaPreTrainedModel,
 )
+from ..nemotron.modeling_nemotron import NemotronMLP
 
 
 class Jais2Config(LlamaConfig):
@@ -165,18 +165,8 @@ def __init__(
 __all__ = ["Jais2Config"]
 
 
-class Jais2MLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.up_proj(x)))
+class Jais2MLP(NemotronMLP):
+    pass
 
 
 class Jais2DecoderLayer(LlamaDecoderLayer):
diff --git a/tests/models/jais2/test_modeling_jais2.py b/tests/models/jais2/test_modeling_jais2.py
@@ -128,11 +128,11 @@ def test_model_logits(self):
     @slow
     @require_torch_accelerator
     def test_model_logits_bf16(self):
-        """Test model logits in bfloat16 precision."""
+        """Test model logits in float16 precision."""
         model = Jais2ForCausalLM.from_pretrained(
             self.checkpoint,
             device_map="auto",
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.float16,
         )
 
         input_text = "The capital of France is"