From a81757787ea70cbec49728c8a19b5a98a14fc386 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 21 Jun 2022 16:10:29 +0200
Subject: [PATCH 1/9] add first generation tutorial

---
 docs/source/en/generation.mdx | 254 ++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 docs/source/en/generation.mdx

diff --git a/docs/source/en/generation.mdx b/docs/source/en/generation.mdx
new file mode 100644
index 000000000000..b5f1ac54652f
--- /dev/null
+++ b/docs/source/en/generation.mdx
@@ -0,0 +1,254 @@
+
+
+# How to generate text with 🤗 Transformers
+
+Generating text with transformer models is done via *auto-regressive* language generation:
+a sequence of tokens is iteratively forwarded through the model, and the next token is selected
+from the model's output distribution until a stopping criterion is met.
+
+This section is a practical guide on how to use 🤗 Transformers' text generation method
+[`~generation_utils.GenerationMixin.generate`] with different generation methods, model architectures, and
+generation configurations.
+
+Before diving into the practical examples, readers are strongly advised to go over the more theoretical
+blog post on [text generation](https://huggingface.co/blog/how-to-generate) to understand how the different
+generation methods work.
+
+The most common **generation methods** are:
+
+1. Greedy search: [`~generation_utils.GenerationMixin.greedy_search`],
+2. Sampling: [`~generation_utils.GenerationMixin.sample`], and
+3. Beam search: [`~generation_utils.GenerationMixin.beam_search`].
+
+All model architectures can broadly be divided into **decoder-only** and **encoder-decoder**
+models.
+
+**Decoder-only** models include architectures such as [GPT-2](), [OPT](), and [BLOOM]() and
+can be loaded via the [`AutoModelForCausalLM`] class. Decoder-only models are mainly used for
+open-ended text generation, but can essentially be used to solve any text-to-text task via
+prompting, as shown in the [GPT-3 paper]( ).
+
+**Encoder-decoder** models include architectures such as [T5](), [BART](), and [Marian]() and
+can be loaded via the [`AutoModelForSeq2SeqLM`] class. Encoder-decoder models are mainly used for
+translation and summarization, but can also solve any task that can be framed as a text-to-text
+problem, as shown in the [T5 paper]( ).
+
+## Greedy search
+
+The simplest and default generation method is [`~generation_utils.GenerationMixin.greedy_search`].
+
+In greedy search, the model picks the most probable next token from the predicted distribution at every step.
+The selected token id is concatenated to the previous token ids, and the most probable next token is picked again.
+Since taking the *most probable* token simply corresponds to an `argmax` over the logits, greedy
+search is a *deterministic* generation method.
+
+It is very easy to write such a loop yourself for a *decoder-only* model, such as
+[opt-125m](https://huggingface.co/facebook/opt-125m).
+
+First, we load the model.
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+```
+
+Next, we define a prompt that the model will be conditioned on to predict the next tokens.
+
+```py
+prompt = "In the winter, it is cold."
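+
+# Tokenize the prompt into a tensor of token ids of shape (batch_size, sequence_length)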
+prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
+```
+
+Now we can write a short iterative greedy search loop that generates 10 new tokens.
+
+<Tip>
+
+While the generation parameter [`max_length`]( ) is widely used and always set
+by default, we strongly recommend switching to `max_new_tokens` instead.
+`max_length` counts the prompt tokens as well, so the model generates **up to** `max_length` tokens
+in total, whereas `max_new_tokens` generates up to `max_new_tokens` new tokens independently of the
+input length. Unexpected behavior can occur when using `max_length` with `input_ids` that are
+longer than `max_length`, which is why we recommend using `max_new_tokens` instead.
+
+</Tip>
+
+```py
+import torch
+
+max_new_tokens = 10
+
+input_ids = prompt_ids
+for _ in range(max_new_tokens):
+    with torch.no_grad():
+        logits = model(input_ids).logits
+
+    # take the argmax over the vocabulary for the last position
+    next_token_id = torch.argmax(logits[:, -1:], dim=-1)
+
+    # append the predicted token id to the running sequence
+    input_ids = torch.concat([input_ids, next_token_id], dim=-1)
+```
+
+After having generated 10 new tokens, let's take a look at the prediction.
+
+```py
+tokenizer.batch_decode(input_ids)
+```
+
+Instead of writing this greedy search loop yourself every time, you can simply
+make use of [`~generation_utils.GenerationMixin.generate`].
+
+Let's see how to replicate the above example with [`~generation_utils.GenerationMixin.generate`].
+
+<Tip>
+
+Greedy search is activated by default or when passing `num_beams=1` and `do_sample=False` to
+[`~generation_utils.GenerationMixin.generate`].
+
+</Tip>
+
+```py
+sequences = model.generate(prompt_ids, do_sample=False, num_beams=1, max_new_tokens=10)
+tokenizer.batch_decode(sequences)
+```
+
+For **encoder-decoder** models, we first need to encode the prompt with the encoder
+and then iteratively forward the decoder token ids through the decoder, just like in the example above.
+Let's load an encoder-decoder model such as [t5-small]( ):
+
+```py
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+tokenizer = AutoTokenizer.from_pretrained("t5-small")
+```
+
+Let's use the same prompt, tokenize it with the newly loaded tokenizer, and pass it once through the encoder.
+
+```py
+prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
+input_ids = prompt_ids
+
+with torch.no_grad():
+    encoder_hidden_states = model.encoder(input_ids).last_hidden_state
+```
+
+The decoder of encoder-decoder models contains so-called *cross-attention* layers
+which have to be conditioned on the encoded hidden states. Therefore, we will
+pass the encoded hidden states via `encoder_outputs` at every iteration step below.
+
+<Tip>
+
+To better understand how *encoder-decoder* models work from a theoretical
+point of view, please take a look at the [Encoder-decoder blog post]( ).
+
+</Tip>
+
+```py
+# the decoder starts from the model-specific decoder start token
+input_ids = torch.tensor([[model.config.decoder_start_token_id]])
+
+max_new_tokens = 10
+for _ in range(max_new_tokens):
+    with torch.no_grad():
+        logits = model(decoder_input_ids=input_ids, encoder_outputs=(encoder_hidden_states,)).logits
+
+    next_token_id = torch.argmax(logits[:, -1:], dim=-1)
+
+    input_ids = torch.concat([input_ids, next_token_id], dim=-1)
+```
+
+The 🤗 Transformers [`~generation_utils.GenerationMixin.generate`] method automatically detects whether the model
+is an encoder-decoder model and correctly prepares the encoder hidden states for you.
+
+```py
+sequences = model.generate(prompt_ids, do_sample=False, num_beams=1, max_new_tokens=10)
+tokenizer.batch_decode(sequences)
+```
+
+When generating longer texts, e.g. with `max_new_tokens=200`,
+you will notice that [`~generation_utils.GenerationMixin.greedy_search`] automatically
+finishes the generation loop early once the model predicts a so-called *end-of-sequence* (EOS) token.
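+
+Using [`~generation_utils.GenerationMixin.generate`], this early stopping is easy to observe. The following
+minimal sketch reuses the `model` and `prompt_ids` defined above; the exact text that gets generated
+depends on the model and the prompt:
+
+```py
+sequences = model.generate(prompt_ids, do_sample=False, num_beams=1, max_new_tokens=200)
+
+# Although up to 200 new tokens are allowed, generation stops as soon as the model
+# predicts its end-of-sequence (EOS) token.
+tokenizer.batch_decode(sequences, skip_special_tokens=True)
+```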
+
+Stopping at the EOS token, among many other features such as cached generation and batched generation,
+is missing from the naive loop above. Therefore, we strongly recommend using the 🤗 Transformers
+[`~generation_utils.GenerationMixin.generate`] method instead of writing your own generation loop.
+
+Other important generation parameters that can be set when doing greedy search with
+[`~generation_utils.GenerationMixin.generate`] include:
+
+- `min_length` to prevent the model from producing the EOS token before a minimum length is reached,
+- `repetition_penalty` to penalize tokens that have already been generated,
+- `no_repeat_ngram_size` to forbid repeating n-grams of a given size, and
+- `bad_words_ids` to forbid specific token sequences from being generated.
+
+## Sampling
+
+In contrast to greedy search, sampling randomly draws the next token from the model's output
+distribution at every step, which makes generation *non-deterministic*. Sampling is activated by
+passing `do_sample=True` to [`~generation_utils.GenerationMixin.generate`], and its randomness can be
+controlled with parameters such as `temperature`, `top_k`, and `top_p`.
+
+## Using tokenizers from 🤗 Tokenizers
+
+The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be
+loaded very simply into 🤗 Transformers.
+
+Before getting into the specifics, let's first start by creating a dummy tokenizer in a few lines:
+
+```python
+>>> from tokenizers import Tokenizer
+>>> from tokenizers.models import BPE
+>>> from tokenizers.trainers import BpeTrainer
+>>> from tokenizers.pre_tokenizers import Whitespace
+
+>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+
+>>> tokenizer.pre_tokenizer = Whitespace()
+>>> files = [...]
+>>> tokenizer.train(files, trainer)
+```
+
+We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
+a JSON file for future reuse.
+
+## Loading directly from the tokenizer object
+
+Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
+[`PreTrainedTokenizerFast`] class allows for easy instantiation by accepting the instantiated
+*tokenizer* object as an argument:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+```
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
+page](main_classes/tokenizer) for more information.
+
+## Loading from a JSON file
+
+In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
+
+```python
+>>> tokenizer.save("tokenizer.json")
+```
+
+The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
+method using the `tokenizer_file` parameter:
+
+```python
+>>> from transformers import PreTrainedTokenizerFast
+
+>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+```
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
+page](main_classes/tokenizer) for more information.
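+
+The reloaded tokenizer behaves like any other 🤗 Transformers tokenizer. As a quick sanity check
+(a minimal sketch; the exact token ids depend on the files the tokenizer was trained on):
+
+```python
+>>> encoding = fast_tokenizer("Hello, how are you?")
+>>> encoding.input_ids  # token ids produced by the BPE model trained above
+>>> fast_tokenizer.decode(encoding.input_ids)  # round-trip back to text
+```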
From dd2abb9f64e9bfd4da93332d37301929c2bc2907 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 8 Dec 2025 19:03:24 +0000 Subject: [PATCH 2/9] WIP --- .../models/ministral3/convert_ministral3_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 29b267c888e2..2e4715c1b17b 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -196,7 +196,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) assert original_config["quantization"]["qscheme_act"] == "TENSOR" quantization_config = { "activation_scheme": "static", - "modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector"], + "modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector", "lm_head"], "quant_method": "fp8", "weight_block_size": None, } From fd602a1ed66e038974054467153813fc7afa5075 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 8 Dec 2025 19:05:13 +0000 Subject: [PATCH 3/9] WIP --- docs/source/en/generation.mdx | 254 ---------------------------------- 1 file changed, 254 deletions(-) delete mode 100644 docs/source/en/generation.mdx diff --git a/docs/source/en/generation.mdx b/docs/source/en/generation.mdx deleted file mode 100644 index b5f1ac54652f..000000000000 --- a/docs/source/en/generation.mdx +++ /dev/null @@ -1,254 +0,0 @@ - - -# How to generate text with 🤗 Transformers. - -Generating text with transformer models can be done via *auto-regressive* language generation. -Auto-regressive generation is defined as iteratively forwarding a sequence of tokens through the model and sampling the next token in the sequence from the model's output distribution until a certain stopping criteria is met. - -This section will serve as a practical guide on how to use 🤗 Transformers' text-generation method -[`~generation_utils.GenerationMixin.generate`] for different generation methods, model architectures and different -generation configurations. - -Before diving into some practical examples, the reader is strongly advised to go over this more theoretical -blog post on [text-generation](https://huggingface.co/blog/how-to-train) to understand how the different -generation methods function. - -The most common **generation methods** are: - -- 1. Greedy search: [`~generation_utils.GenerationMixin.greedy_search`], -- 2. Sample: [`~generation_utils.GenerationMixin.sample`], and -- 3. Beam Search: [`~generation_utils.GenerationMixin.beam_search`]. - -and we can divide all model architectures broadly into **decoder-only** and **encoder-decoder** -models. - -**Decoder-only** models include architectures, such as [GPT-2](), [OPT](), and [BLOOM]() and -can be loaded via the [`AutoModelForCausalLM`] class. Decoder-only models are mainly used for -open-ended text-generation, but can essentially be used to solve every text-to-text task via -prompt-tuning as introduced in the [GPT-3 paper]( ). - -**Encoder-Decoder** models include architectures, such as [T5](), [Bart](), and [Marian]() and -can be loaded via the [`AutoModelForSeq2SeqLM`] class. Encoder-Decoder models are mainly used for -translation and summarization, but can also be used to solve every text-to-text task when the task -is framed as a text-to-text task as has been shown in the [T5 paper]( ). 
- -## Greedy search - -The simplest and default generation method is [`~generation_utils.GenerationMixin.greedy_search`]. - -For greedy search, the model samples from the predicted logit distribution by simply taking the most probably logit. This logit defines the next token id that is concatenated to the passed token ids and the most probably logit is sampled again. As the *most probably* logit corresponds to simply taking the `argmax`, greedy -search is a *deterministic* generation method. - -It is very easy to write such a loop yourself for a *decoder-only* model, such as [opt-125m](facebook/opt-125m). - -First, we load the model. - -```py -from transformers import AutoModelForCausalLM, AutoTokenizer - -model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") -tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") -``` - -Next, we define a prompt the model be conditioned on to predict the next tokens. - -```py -prompt = "In the winter, it is cold." - -prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids -``` - -Now we can write a short iterative greedy serach loop that generates 20 tokens. - - - -While the generation parameter [`max_length`]( ) is widely used and always set -by default, we strongly recommend users to switch to using `max_new_tokens` instead. -`max_length` will generate tokens **up to** `max_length` where as `max_new_tokens` -will generate exactly `max_new_tokens` independent of the input length. -Unexpected behavior can occur when using `max_length` with `input_ids` that are -longer than `max_length` which is why we recommend using `max_new_tokens` instead. - - - -```py -max_new_tokens = 10 - -input_ids = prompt_ids -for _ in range(max_new_tokens): - with torch.no_grad(): - logits = model(input_ids).logits - - next_token_id = torch.argmax(logits[:, -1:], dim=-1) - - input_ids = torch.concat([input_ids, next_token_id], dim=-1) -``` - -After having generated 10 new tokens, let's take a look at the prediction. - -```py -tokenizer.batch_decode(input_ids) -``` - -Instead of writing this greedy_search method everytime yourself, one can instead -make use of [`~generation_utils.GenerationMixin.generate`]. - -Let's see how to replicate the above example with [`~generation_utils.GenerationMixin.generate`]. - - - -Greedy search is activated by default or when passing `num_beams=1` and `do_sample=False` to -[`~generation_utils.GenerationMixin.generate`] - - - -```py -sequenences = model.generate(prompt_ids, do_sample=False, num_beams=1) -tokenizer.batch_decode(input_ids) -``` - -For **encoder-decoder** models, we first need to encode the prompt with the encoder -and then iteratively forward the `input_ids` through the decoder just like in the example above. -Let's load an encoder-decoder model such as [t5-small]( ): - -```py -from transformers import AutoModelForSeq2SeqLM, AutoTokenizer - -model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") -tokenizer = AutoTokenizer.from_pretrained("t5-small") -``` - -Let's use the same prompt, but now pass it once to the encoder. - -```py -input_ids = prompt_ids - -with torch.no_grad(): - encoder_hidden_states = model.encoder(input_ids).hidden_states -``` - -The decoder of encoder-decoder models contains so-called *cross-attention* layers -which have to be conditioned on encoded hidden states. Therefore, we will -pass the `encoder_hidden_states` at every iteration step below. 
- - - -To better understand how *encoder-decoder* models function from a theoretical -point of view please take a look at the [Encoder-decoder blog post]( ). - - - -```py -input_ids = torch.tensor([[model.config.decoder_start_token_id]]) - -max_new_tokens = 10 -for _ in range(max_new_tokens): - with torch.no_grad(): - logits = model(input_ids, encoder_hidden_states=encoder_hidden_states).logits - - next_token_id = torch.argmax(logits[:, -1:], dim=-1) -``` - -Transformers `generate` method automatically detects if the model is an encoder-decoder -model and correspondingly correctly prepare the `encoder_hidden_states`. - -```py -sequenences = model.generate(prompt_ids, do_sample=False, num_beams=1) -tokenizer.batch_decode(input_ids) -``` - -When generating longer texts, e.g. `max_new_tokens = 200` -you will notice that [`~generation_utils.GenerationMixin.greedy_search`] will automatically -finish the generation loop early because the model has predicted a so-called *end-of-sentence* token. - -Stopping at the EOS token amongst many other features, such as cached generation, batched generation, etc... -is missing in the native loop above. Therefore, we strongly recommend to use 🤗 Transformers generate -method instead of writing your own training loop. - -Other important generation features that can be set when doing `greedy_search` include: - -- - -## Sample - - - - - -it again into the - -simply takes the index of the most probably - - -## Greedy search generate -sampling methods - -The simplest form of sampling is to t - - -The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be -loaded very simply into 🤗 Transformers. - -Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines: - -```python ->>> from tokenizers import Tokenizer ->>> from tokenizers.models import BPE ->>> from tokenizers.trainers import BpeTrainer ->>> from tokenizers.pre_tokenizers import Whitespace - ->>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) ->>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) - ->>> tokenizer.pre_tokenizer = Whitespace() ->>> files = [...] ->>> tokenizer.train(files, trainer) -``` - -We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to -a JSON file for future re-use. - -## Loading directly from the tokenizer object - -Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The -[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated -*tokenizer* object as an argument: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) -``` - -This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer -page](main_classes/tokenizer) for more information. 
- -## Loading from a JSON file - -In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer: - -```python ->>> tokenizer.save("tokenizer.json") -``` - -The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization -method using the `tokenizer_file` parameter: - -```python ->>> from transformers import PreTrainedTokenizerFast - ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") -``` - -This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer -page](main_classes/tokenizer) for more information. From c254d5f6a4885c28e2af105946e2664bd7ac7637 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 8 Dec 2025 22:53:05 +0000 Subject: [PATCH 4/9] WIP --- .../convert_ministral3_weights_to_hf.py | 75 ++++++++++--------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 2e4715c1b17b..147bac57a6d8 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -38,22 +38,22 @@ STATE_DICT_MAPPING = { # Text model keys r"^output.weight": r"lm_head.weight", - r"^norm.weight": r"model.language_model.norm.weight", - r"^tok_embeddings.weight": r"model.language_model.embed_tokens.weight", - r"^layers.(\d+).attention_norm.weight": r"model.language_model.layers.\1.input_layernorm.weight", - r"^layers.(\d+).ffn_norm.weight": r"model.language_model.layers.\1.post_attention_layernorm.weight", - r"^layers.(\d+).attention.w(q|k|v|o).weight": r"model.language_model.layers.\1.self_attn.\2_proj.weight", - r"^layers.(\d+).feed_forward.w1.weight": r"model.language_model.layers.\1.mlp.gate_proj.weight", - r"^layers.(\d+).feed_forward.w2.weight": r"model.language_model.layers.\1.mlp.down_proj.weight", - r"^layers.(\d+).feed_forward.w3.weight": r"model.language_model.layers.\1.mlp.up_proj.weight", - r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": r"model.language_model.layers.\1.self_attn.\2_proj.activation_scale", - r"^layers.(\d+).feed_forward.w1.qscale_act": r"model.language_model.layers.\1.mlp.gate_proj.activation_scale", - r"^layers.(\d+).feed_forward.w2.qscale_act": r"model.language_model.layers.\1.mlp.down_proj.activation_scale", - r"^layers.(\d+).feed_forward.w3.qscale_act": r"model.language_model.layers.\1.mlp.up_proj.activation_scale", - r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": r"model.language_model.layers.\1.self_attn.\2_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w1.qscale_weight": r"model.language_model.layers.\1.mlp.gate_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w2.qscale_weight": r"model.language_model.layers.\1.mlp.down_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w3.qscale_weight": r"model.language_model.layers.\1.mlp.up_proj.weight_scale_inv", + r"^norm.weight": r"model.norm.weight", + r"^tok_embeddings.weight": r"model.embed_tokens.weight", + r"^layers.(\d+).attention_norm.weight": r"model.layers.\1.input_layernorm.weight", + r"^layers.(\d+).ffn_norm.weight": r"model.layers.\1.post_attention_layernorm.weight", + r"^layers.(\d+).attention.w(q|k|v|o).weight": r"model.layers.\1.self_attn.\2_proj.weight", + r"^layers.(\d+).feed_forward.w1.weight": r"model.layers.\1.mlp.gate_proj.weight", + r"^layers.(\d+).feed_forward.w2.weight": 
r"model.layers.\1.mlp.down_proj.weight", + r"^layers.(\d+).feed_forward.w3.weight": r"model.layers.\1.mlp.up_proj.weight", + r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": r"model.layers.\1.self_attn.\2_proj.activation_scale", + r"^layers.(\d+).feed_forward.w1.qscale_act": r"model.layers.\1.mlp.gate_proj.activation_scale", + r"^layers.(\d+).feed_forward.w2.qscale_act": r"model.layers.\1.mlp.down_proj.activation_scale", + r"^layers.(\d+).feed_forward.w3.qscale_act": r"model.layers.\1.mlp.up_proj.activation_scale", + r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": r"model.layers.\1.self_attn.\2_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w1.qscale_weight": r"model.layers.\1.mlp.gate_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w2.qscale_weight": r"model.layers.\1.mlp.down_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w3.qscale_weight": r"model.layers.\1.mlp.up_proj.weight_scale_inv", # Vision model keys r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"model.vision_tower.transformer.layers.\1.attention_norm.weight", @@ -114,10 +114,11 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): key_value_dim = head_dim * num_attention_heads query_dim = head_dim * num_attention_heads else: - num_attention_heads = config.text_config.num_attention_heads - hidden_size = config.text_config.hidden_size - head_dim = config.text_config.head_dim - num_key_value_heads = config.text_config.num_key_value_heads + text_config = config.text_config if isinstance(config, Mistral3Config) else config + num_attention_heads = text_config.num_attention_heads + hidden_size = text_config.hidden_size + head_dim = text_config.head_dim + num_key_value_heads = text_config.num_key_value_heads key_value_dim = head_dim * num_key_value_heads query_dim = head_dim * num_attention_heads @@ -161,7 +162,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) "beta_slow": float(original_config["yarn"]["alpha"]), "mscale_all_dim": 1.0, "mscale": 1.0, - "llama_4_scaling_beta": original_config["llama_4_scaling"]["beta"], + # "llama_4_scaling_beta": original_config["llama_4_scaling"]["beta"], } # These are not always defined depending on `params.json` @@ -173,10 +174,22 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) if new_text_config_kwargs["sliding_window"] is not None: new_text_config_kwargs["sliding_window"] = int(new_text_config_kwargs["sliding_window"]) - new_text_config = Ministral3Config(**new_text_config_kwargs) + def get_maybe_quant_config() -> dict: + kwargs = {} + if original_config.get("quantization", {}).get("qformat_weight") == "fp8_e4m3": + assert original_config["quantization"]["qscheme_act"] == "TENSOR" + quantization_config = { + "activation_scheme": "static", + "modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector", "lm_head"], + "quant_method": "fp8", + "weight_block_size": None, + } + kwargs['quantization_config'] = AutoQuantizationConfig.from_dict(quantization_config) + return kwargs # No vision if original_vision_config is None: + new_text_config = Ministral3Config(**new_text_config_kwargs, **get_maybe_quant_config()) return new_text_config # Vision config @@ -191,17 +204,6 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) _ = new_vision_config.pop("max_image_size") new_vision_config = PixtralVisionConfig(hidden_act="silu", **new_vision_config) - kwargs = {} - if original_config.get("quantization", 
{}).get("qformat_weight") == "fp8_e4m3": - assert original_config["quantization"]["qscheme_act"] == "TENSOR" - quantization_config = { - "activation_scheme": "static", - "modules_to_not_convert": ["model.vision_tower", "model.multi_modal_projector", "lm_head"], - "quant_method": "fp8", - "weight_block_size": None, - } - kwargs["quantization_config"] = AutoQuantizationConfig.from_dict(quantization_config) - new_config = Mistral3Config( vision_config=new_vision_config, text_config=new_text_config, @@ -209,7 +211,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) image_token_id=image_token_id, spatial_merge_size=spatial_merge_size, vision_feature_layer=-1, - **kwargs, + **get_maybe_quant_config() ) return new_config @@ -228,8 +230,9 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd new_dict = convert_state_dict(original_state_dict, config) full_state_dict.update(new_dict) - if config.text_config.tie_word_embeddings: - full_state_dict["lm_head.weight"] = full_state_dict["model.language_model.embed_tokens.weight"] + text_config = config.text_config if isinstance(config, Mistral3Config) else config + if text_config.tie_word_embeddings: + full_state_dict["lm_head.weight"] = full_state_dict["model.embed_tokens.weight"] # Load weights into model and resave them with torch.device("meta"): From 013cbc807b2c3a25e0f2ecdf0219e3f3e85cad11 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 8 Dec 2025 23:17:59 +0000 Subject: [PATCH 5/9] WIP --- .../convert_ministral3_weights_to_hf.py | 82 ++++++++++--------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 147bac57a6d8..4c602bd6e4ea 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -35,46 +35,48 @@ # fmt: off -STATE_DICT_MAPPING = { - # Text model keys - r"^output.weight": r"lm_head.weight", - r"^norm.weight": r"model.norm.weight", - r"^tok_embeddings.weight": r"model.embed_tokens.weight", - r"^layers.(\d+).attention_norm.weight": r"model.layers.\1.input_layernorm.weight", - r"^layers.(\d+).ffn_norm.weight": r"model.layers.\1.post_attention_layernorm.weight", - r"^layers.(\d+).attention.w(q|k|v|o).weight": r"model.layers.\1.self_attn.\2_proj.weight", - r"^layers.(\d+).feed_forward.w1.weight": r"model.layers.\1.mlp.gate_proj.weight", - r"^layers.(\d+).feed_forward.w2.weight": r"model.layers.\1.mlp.down_proj.weight", - r"^layers.(\d+).feed_forward.w3.weight": r"model.layers.\1.mlp.up_proj.weight", - r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": r"model.layers.\1.self_attn.\2_proj.activation_scale", - r"^layers.(\d+).feed_forward.w1.qscale_act": r"model.layers.\1.mlp.gate_proj.activation_scale", - r"^layers.(\d+).feed_forward.w2.qscale_act": r"model.layers.\1.mlp.down_proj.activation_scale", - r"^layers.(\d+).feed_forward.w3.qscale_act": r"model.layers.\1.mlp.up_proj.activation_scale", - r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": r"model.layers.\1.self_attn.\2_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w1.qscale_weight": r"model.layers.\1.mlp.gate_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w2.qscale_weight": r"model.layers.\1.mlp.down_proj.weight_scale_inv", - r"^layers.(\d+).feed_forward.w3.qscale_weight": r"model.layers.\1.mlp.up_proj.weight_scale_inv", - - # Vision 
model keys - r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"model.vision_tower.transformer.layers.\1.attention_norm.weight", - r"^vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"model.vision_tower.transformer.layers.\1.ffn_norm.weight", - r"^vision_encoder.transformer.layers.(\d+).attention.w(q|k|v|o).weight": r"model.vision_tower.transformer.layers.\1.attention.\2_proj.weight", - r"^vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", - r"^vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", - r"^vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", - r"^vision_language_adapter.w_in": r"model.multi_modal_projector.linear_1", - r"^vision_language_adapter.w_out": r"model.multi_modal_projector.linear_2", - r"^vision_encoder.ln_pre.weight": r"model.vision_tower.ln_pre.weight", - r"^vision_encoder.patch_conv.weight": r"model.vision_tower.patch_conv.weight", - r"^patch_merger.merging_layer.weight": r"model.multi_modal_projector.patch_merger.merging_layer.weight", - r"^pre_mm_projector_norm.weight": r"model.multi_modal_projector.norm.weight", -} +def get_sd_mapping(has_vision: bool) -> dict: + model_key = "model.language_model" if has_vision else "model" + return { + # Text model keys + r"^output.weight": r"lm_head.weight", + r"^norm.weight": rf"{model_key}.norm.weight", + r"^tok_embeddings.weight": rf"{model_key}.embed_tokens.weight", + r"^layers.(\d+).attention_norm.weight": rf"{model_key}.layers.\1.input_layernorm.weight", + r"^layers.(\d+).ffn_norm.weight": rf"{model_key}.layers.\1.post_attention_layernorm.weight", + r"^layers.(\d+).attention.w(q|k|v|o).weight": rf"{model_key}.layers.\1.self_attn.\2_proj.weight", + r"^layers.(\d+).feed_forward.w1.weight": rf"{model_key}.layers.\1.mlp.gate_proj.weight", + r"^layers.(\d+).feed_forward.w2.weight": rf"{model_key}.layers.\1.mlp.down_proj.weight", + r"^layers.(\d+).feed_forward.w3.weight": rf"{model_key}.layers.\1.mlp.up_proj.weight", + r"^layers.(\d+).attention.w(q|k|v|o).qscale_act": rf"{model_key}.layers.\1.self_attn.\2_proj.activation_scale", + r"^layers.(\d+).feed_forward.w1.qscale_act": rf"{model_key}.layers.\1.mlp.gate_proj.activation_scale", + r"^layers.(\d+).feed_forward.w2.qscale_act": rf"{model_key}.layers.\1.mlp.down_proj.activation_scale", + r"^layers.(\d+).feed_forward.w3.qscale_act": rf"{model_key}.layers.\1.mlp.up_proj.activation_scale", + r"^layers.(\d+).attention.w(q|k|v|o).qscale_weight": rf"{model_key}.layers.\1.self_attn.\2_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w1.qscale_weight": rf"{model_key}.layers.\1.mlp.gate_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w2.qscale_weight": rf"{model_key}.layers.\1.mlp.down_proj.weight_scale_inv", + r"^layers.(\d+).feed_forward.w3.qscale_weight": rf"{model_key}.layers.\1.mlp.up_proj.weight_scale_inv", + + # Vision model keys + r"vision_encoder.transformer.layers.(\d+).attention_norm.weight": r"model.vision_tower.transformer.layers.\1.attention_norm.weight", + r"^vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"model.vision_tower.transformer.layers.\1.ffn_norm.weight", + r"^vision_encoder.transformer.layers.(\d+).attention.w(q|k|v|o).weight": r"model.vision_tower.transformer.layers.\1.attention.\2_proj.weight", + 
r"^vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", + r"^vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", + r"^vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"model.vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", + r"^vision_language_adapter.w_in": r"model.multi_modal_projector.linear_1", + r"^vision_language_adapter.w_out": r"model.multi_modal_projector.linear_2", + r"^vision_encoder.ln_pre.weight": r"model.vision_tower.ln_pre.weight", + r"^vision_encoder.patch_conv.weight": r"model.vision_tower.patch_conv.weight", + r"^patch_merger.merging_layer.weight": r"model.multi_modal_projector.patch_merger.merging_layer.weight", + r"^pre_mm_projector_norm.weight": r"model.multi_modal_projector.norm.weight", + } # fmt: on -def map_old_key_to_new(old_key): +def map_old_key_to_new(old_key, mapping): """Map of a key of the original state dict to the equivalent key in HF format""" - for pattern, replacement in STATE_DICT_MAPPING.items(): + for pattern, replacement in mapping.items(): new_key, n_replace = re.subn(pattern, replacement, old_key) # Early exit of the loop if n_replace > 0: @@ -100,11 +102,13 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): """Convert a state dict file, when a single `nn.Module` is never sharded in different files (usual case).""" new_dict = {} + is_vision = isinstance(config, Mistral3Config) + mapping = get_sd_mapping(is_vision) for old_key, tensor in original_state_dict.items(): if "fake_quantizer" in old_key: continue - new_key = map_old_key_to_new(old_key) + new_key = map_old_key_to_new(old_key, mapping) if "vision" in old_key: num_attention_heads = config.vision_config.num_attention_heads @@ -114,7 +118,7 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): key_value_dim = head_dim * num_attention_heads query_dim = head_dim * num_attention_heads else: - text_config = config.text_config if isinstance(config, Mistral3Config) else config + text_config = config.text_config if is_vision else config num_attention_heads = text_config.num_attention_heads hidden_size = text_config.hidden_size head_dim = text_config.head_dim @@ -162,7 +166,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) "beta_slow": float(original_config["yarn"]["alpha"]), "mscale_all_dim": 1.0, "mscale": 1.0, - # "llama_4_scaling_beta": original_config["llama_4_scaling"]["beta"], + "llama_4_scaling_beta": original_config.get("llama_4_scaling", {}).get("beta", 0), } # These are not always defined depending on `params.json` From 36356814909eead83a9d13a7cea229b0a8026c93 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 9 Dec 2025 00:04:20 +0000 Subject: [PATCH 6/9] WIP --- .../ministral3/convert_ministral3_weights_to_hf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 4c602bd6e4ea..989fc7f795ca 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -135,7 +135,7 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): return new_dict -def convert_config(original_config: dict, 
max_position_embeddings: int = 262144): +def convert_config(original_config: dict, max_position_embeddings: int = 262144, is_vision: bool = True): original_vision_config = original_config.pop("vision_encoder", None) original_text_config = original_config @@ -164,7 +164,7 @@ def convert_config(original_config: dict, max_position_embeddings: int = 262144) "original_max_position_embeddings": original_config["yarn"]["original_max_position_embeddings"], "beta_fast": float(original_config["yarn"]["beta"]), "beta_slow": float(original_config["yarn"]["alpha"]), - "mscale_all_dim": 1.0, + "mscale_all_dim": 1.0 if is_vision else 0.0, "mscale": 1.0, "llama_4_scaling_beta": original_config.get("llama_4_scaling", {}).get("beta", 0), } @@ -224,7 +224,8 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd """Convert the model and save it (this implicitly save the config as well).""" params = read_json(os.path.join(input_dir, "params.json")) - config = convert_config(params, max_position_embeddings) + is_vision = isinstance(config, Mistral3Config) + config = convert_config(params, max_position_embeddings, is_vision) full_state_dict = {} # The model may be split between different files, but a single nn.Module is always fully present in a single file @@ -234,7 +235,7 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd new_dict = convert_state_dict(original_state_dict, config) full_state_dict.update(new_dict) - text_config = config.text_config if isinstance(config, Mistral3Config) else config + text_config = config.text_config if is_vision else config if text_config.tie_word_embeddings: full_state_dict["lm_head.weight"] = full_state_dict["model.embed_tokens.weight"] From 3e829f8162ded633c2d4324f14f892425d85c680 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 9 Dec 2025 11:42:54 +0100 Subject: [PATCH 7/9] uP- --- .../models/ministral3/convert_ministral3_weights_to_hf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 989fc7f795ca..6ec88379a24d 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -137,6 +137,7 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): def convert_config(original_config: dict, max_position_embeddings: int = 262144, is_vision: bool = True): original_vision_config = original_config.pop("vision_encoder", None) + assert is_vision == original_vision_config, f"is_vision={is_vision} but original_vision_config={original_vision_config}" original_text_config = original_config # Text config @@ -224,7 +225,7 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd """Convert the model and save it (this implicitly save the config as well).""" params = read_json(os.path.join(input_dir, "params.json")) - is_vision = isinstance(config, Mistral3Config) + is_vision = params.pop("vision_encoder", None) is not None config = convert_config(params, max_position_embeddings, is_vision) full_state_dict = {} From 7187aba224097fc3a0bc9c442dece64ee6499dd1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 9 Dec 2025 16:55:42 +0100 Subject: [PATCH 8/9] uP- --- .../models/ministral3/convert_ministral3_weights_to_hf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 6ec88379a24d..34d0fe328a90 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -137,7 +137,9 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): def convert_config(original_config: dict, max_position_embeddings: int = 262144, is_vision: bool = True): original_vision_config = original_config.pop("vision_encoder", None) - assert is_vision == original_vision_config, f"is_vision={is_vision} but original_vision_config={original_vision_config}" + assert is_vision == original_vision_config, ( + f"is_vision={is_vision} but original_vision_config={original_vision_config}" + ) original_text_config = original_config # Text config @@ -189,7 +191,7 @@ def get_maybe_quant_config() -> dict: "quant_method": "fp8", "weight_block_size": None, } - kwargs['quantization_config'] = AutoQuantizationConfig.from_dict(quantization_config) + kwargs["quantization_config"] = AutoQuantizationConfig.from_dict(quantization_config) return kwargs # No vision @@ -216,7 +218,7 @@ def get_maybe_quant_config() -> dict: image_token_id=image_token_id, spatial_merge_size=spatial_merge_size, vision_feature_layer=-1, - **get_maybe_quant_config() + **get_maybe_quant_config(), ) return new_config From 2d2df3db1ffdab073c0b550f1695698960c15e40 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 9 Dec 2025 16:47:19 +0000 Subject: [PATCH 9/9] WIP --- .../ministral3/convert_ministral3_weights_to_hf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py index 34d0fe328a90..baf6787204dd 100644 --- a/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py +++ b/src/transformers/models/ministral3/convert_ministral3_weights_to_hf.py @@ -137,7 +137,7 @@ def convert_state_dict(original_state_dict: dict, config: Mistral3Config): def convert_config(original_config: dict, max_position_embeddings: int = 262144, is_vision: bool = True): original_vision_config = original_config.pop("vision_encoder", None) - assert is_vision == original_vision_config, ( + assert is_vision == (original_vision_config is not None), ( f"is_vision={is_vision} but original_vision_config={original_vision_config}" ) original_text_config = original_config @@ -198,6 +198,8 @@ def get_maybe_quant_config() -> dict: if original_vision_config is None: new_text_config = Ministral3Config(**new_text_config_kwargs, **get_maybe_quant_config()) return new_text_config + else: + new_text_config = Ministral3Config(**new_text_config_kwargs) # Vision config new_vision_config = original_vision_config @@ -227,7 +229,7 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd """Convert the model and save it (this implicitly save the config as well).""" params = read_json(os.path.join(input_dir, "params.json")) - is_vision = params.pop("vision_encoder", None) is not None + is_vision = params.get("vision_encoder") is not None config = convert_config(params, max_position_embeddings, is_vision) full_state_dict = {} @@ -240,7 +242,8 @@ def convert_and_write_model(input_dir: str, output_dir: str, max_position_embedd text_config = config.text_config if is_vision else config if 
text_config.tie_word_embeddings: - full_state_dict["lm_head.weight"] = full_state_dict["model.embed_tokens.weight"] + model_key = "model.language_model" if is_vision else "model" + full_state_dict["lm_head.weight"] = full_state_dict[f"{model_key}.embed_tokens.weight"] # Load weights into model and resave them with torch.device("meta"):