
Commit 3f3cae7

Cyrilvallez, Wauplin, and ArthurZucker authored
🚨🚨 [saving] Default to 50GB shards, and remove non-safe serialization (#42734)
* switch
* remove now useless save_function
* a bit more involved than i thought
* all converters
* fix
* pretty print
* fix
* trainer
* update musicgen.md docs
* marc comments
* doc and last missed instances
* CI

---------

Co-authored-by: Wauplin <[email protected]>
Co-authored-by: Arthur <[email protected]>
1 parent 73a13f8 commit 3f3cae7
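In practice, the change boils down to the following minimal sketch (the model id and output directory are illustrative, not from the commit):

```py
# a minimal sketch of the new saving behavior (illustrative model id and paths)
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# weights are now always written as safetensors, sharded at 50GB by default;
# the `safe_serialization` and `save_function` arguments are gone
model.save_pretrained("my-checkpoint")

# the previous 5GB shard size can still be requested explicitly
model.save_pretrained("my-checkpoint", max_shard_size="5GB")
```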

File tree: 81 files changed, +404 −683 lines changed


docs/source/en/model_doc/musicgen.md

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ This model was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-ga
 
 ```bash
 python src/transformers/models/musicgen/convert_musicgen_transformers.py \
-    --checkpoint small --pytorch_dump_folder /output/path --safe_serialization
+    --checkpoint small --pytorch_dump_folder /output/path
 ```
 
 ## Generation
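Since the converter now always writes safetensors, loading the dump folder back is unchanged; a quick sanity check, assuming the paths from the command above:

```py
# a quick check that the converted dump loads as usual (path is illustrative)
from transformers import MusicgenForConditionalGeneration

model = MusicgenForConditionalGeneration.from_pretrained("/output/path")
```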

docs/source/en/quantization/torchao.md

Lines changed: 14 additions & 9 deletions
@@ -639,30 +639,35 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 
 ## Serialization
 
-torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor) for maximum flexibility in supporting new quantized torch.Tensor formats. [Safetensors](https://huggingface.co/docs/safetensors/en/index) serialization and deserialization does not work with torchao.
-
-To avoid arbitrary user code execution, torchao sets `weights_only=True` in [torch.load](https://pytorch.org/docs/stable/generated/torch.load.html) to ensure only tensors are loaded. Any known user functions can be whitelisted with [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals).
+Saving the quantized model with `save_pretrained` (in [safetensors](https://huggingface.co/docs/safetensors/en/index) format) is only supported for torchao >= v0.15. For any version below, it is only possible to manually save as unsafe `.bin` checkpoints with [torch.save](https://docs.pytorch.org/docs/stable/generated/torch.save.html).
 
 <hfoptions id="serialization-examples">
 <hfoption id="save-locally">
 
 ```py
-# don't serialize model with Safetensors
+# torchao >= 0.15
 output_dir = "llama3-8b-int4wo-128"
-quantized_model.save_pretrained("llama3-8b-int4wo-128", safe_serialization=False)
+quantized_model.save_pretrained("llama3-8b-int4wo-128")
 ```
 
 </hfoption>
 <hfoption id="push-to-huggingface-hub">
 
 ```py
-# don't serialize model with Safetensors
+# torchao >= 0.15
 USER_ID = "your_huggingface_user_id"
 REPO_ID = "llama3-8b-int4wo-128"
-quantized_model.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128", safe_serialization=False)
+quantized_model.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")
 tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")
 ```
 
+
+```py
+# torchao < 0.15 -> unsafe serialization
+filename = "llama3-8b-int4wo-128/pytorch_model.bin"
+torch.save(quantized_model.state_dict(), filename)
+```
+
 </hfoption>
 </hfoptions>
 
@@ -687,7 +692,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
 )
 # save the quantized model
 output_dir = "llama-3.1-8b-torchao-int8"
-quantized_model.save_pretrained(output_dir, safe_serialization=False)
+quantized_model.save_pretrained(output_dir)
 
 # reload the quantized model
 reloaded_model = AutoModelForCausalLM.from_pretrained(
@@ -724,7 +729,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
 )
 # save the quantized model
 output_dir = "llama-3.1-8b-torchao-int4-cpu"
-quantized_model.save_pretrained(output_dir, safe_serialization=False)
+quantized_model.save_pretrained(output_dir)
 
 # reload the quantized model
 reloaded_model = AutoModelForCausalLM.from_pretrained(
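Taken together, the updated docs imply gating the save path on the installed torchao version; a minimal sketch, assuming `quantized_model` from the examples above and the `packaging` package:

```py
# a minimal sketch: choose the save path based on the installed torchao version
# (assumes `quantized_model` was created as in the examples in this file)
import torch
import torchao
from packaging import version

if version.parse(torchao.__version__) >= version.parse("0.15.0"):
    quantized_model.save_pretrained("llama3-8b-int4wo-128")  # safetensors
else:
    # older torchao: fall back to an unsafe pickle-based checkpoint
    torch.save(quantized_model.state_dict(), "llama3-8b-int4wo-128/pytorch_model.bin")
```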

examples/3D_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -340,7 +340,7 @@ def collate_fn(batch):
     else:
         # Fallback to regular save for non-distributed case
         save_dir = "test_model_nondist"
-        model.save_pretrained(save_dir, safe_serialization=False)
+        model.save_pretrained(save_dir)
         tokenizer.save_pretrained(save_dir)  # Save tokenizer too
         logger.info(f"Saved model to {save_dir}")

examples/pytorch/3d_parallel_checks.py

Lines changed: 1 addition & 1 deletion
@@ -458,7 +458,7 @@ def collate_fn(batch):
     else:
         # Fallback to regular save for non-distributed case
         save_dir = "test_model_nondist"
-        model.save_pretrained(save_dir, safe_serialization=False)
+        model.save_pretrained(save_dir)
         tokenizer.save_pretrained(save_dir)  # Save tokenizer too
         logger.info(f"Saved model to {save_dir}")

examples/quantization/custom_quantization_int8_example.py

Lines changed: 1 addition & 1 deletion
@@ -216,7 +216,7 @@ def _process_model_after_weight_loading(self, model, **kwargs):
         """
         return True
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True
 
     @property
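Under the new contract, `is_serializable` takes no format flag, since safetensors is the only output format left; a minimal sketch of a custom quantizer stub (the class name is illustrative):

```py
# a minimal sketch of the simplified hook (illustrative class name)
from transformers.quantizers import HfQuantizer

class MyInt8Quantizer(HfQuantizer):
    requires_calibration = False

    def is_serializable(self):
        # no safe_serialization argument anymore: safetensors is the only format
        return True
```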

src/transformers/modeling_utils.py

Lines changed: 77 additions & 97 deletions
@@ -93,7 +93,6 @@
 from .safetensors_conversion import auto_conversion
 from .utils import (
     ADAPTER_SAFE_WEIGHTS_NAME,
-    ADAPTER_WEIGHTS_NAME,
     DUMMY_INPUTS,
     SAFE_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_NAME,
@@ -551,8 +550,7 @@ def _get_resolved_checkpoint_files(
             raise OSError(
                 f"{pretrained_model_name_or_path} does not appear to have a file named"
                 f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} "
-                "and thus cannot be loaded with `safetensors`. Please make sure that the model has "
-                "been saved with `safe_serialization=True` or do not set `use_safetensors=True`."
+                "and thus cannot be loaded with `safetensors`. Please do not set `use_safetensors=True`."
             )
         else:
             # This repo has no safetensors file of any kind, we switch to PyTorch.
@@ -3009,10 +3007,8 @@ def save_pretrained(
         save_directory: Union[str, os.PathLike],
         is_main_process: bool = True,
         state_dict: Optional[dict] = None,
-        save_function: Callable = torch.save,
         push_to_hub: bool = False,
-        max_shard_size: Union[int, str] = "5GB",
-        safe_serialization: bool = True,
+        max_shard_size: Union[int, str] = "50GB",
         variant: Optional[str] = None,
         token: Optional[Union[str, bool]] = None,
         save_peft_format: bool = True,
@@ -3034,18 +3030,13 @@ def save_pretrained(
                 The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                 save parts of the model or if special precautions need to be taken when recovering the state dictionary
                 of a model (like when using model parallelism).
-            save_function (`Callable`):
-                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
-                need to replace `torch.save` by another method.
             push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                 repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                 namespace).
-            max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`):
+            max_shard_size (`int` or `str`, *optional*, defaults to `"50GB"`):
                 The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                 lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
-                We default it to 5GB in order for models to be able to run easily on free-tier google colab instances
-                without CPU OOM issues.
 
                 <Tip warning={true}>
 
@@ -3054,10 +3045,8 @@ def save_pretrained(
 
                 </Tip>
 
-            safe_serialization (`bool`, *optional*, defaults to `True`):
-                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
             variant (`str`, *optional*):
-                If specified, weights are saved in the format pytorch_model.<variant>.bin.
+                If specified, weights are saved in the format model.<variant>.safetensors.
             token (`str` or `bool`, *optional*):
                 The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                 the token generated when running `hf auth login` (stored in `~/.huggingface`).
@@ -3079,9 +3068,7 @@ def save_pretrained(
 
         hf_quantizer = getattr(self, "hf_quantizer", None)
         quantization_serializable = (
-            hf_quantizer is not None
-            and isinstance(hf_quantizer, HfQuantizer)
-            and hf_quantizer.is_serializable(safe_serialization=safe_serialization)
+            hf_quantizer is not None and isinstance(hf_quantizer, HfQuantizer) and hf_quantizer.is_serializable()
         )
 
         if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
@@ -3117,7 +3104,7 @@ def save_pretrained(
 
         metadata = {}
         if hf_quantizer is not None:
-            state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self, safe_serialization)
+            state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self)
             metadata["format"] = "pt"
 
         # Only save the model itself if we are using distributed training
@@ -3209,86 +3196,83 @@ def save_pretrained(
         if self._tp_size is not None:
             state_dict = replace_state_dict_local_with_dtensor(state_dict, self._tp_plan, self._device_mesh)
 
-        if safe_serialization:
-            # TODO: fix safe_serialization for tied weights
-            # Safetensors does not allow tensor aliasing.
-            # We're going to remove aliases before saving
-            ptrs = collections.defaultdict(list)
-            for name, tensor in state_dict.items():
-                if not isinstance(tensor, torch.Tensor):
-                    # Sometimes in the state_dict we have non-tensor objects.
-                    # e.g. in bitsandbytes we have some `str` objects in the state_dict
-                    # In the non-tensor case, fall back to the pointer of the object itself
-                    ptrs[id(tensor)].append(name)
-
-                elif tensor.device.type == "meta":
-                    # In offloaded cases, there may be meta tensors in the state_dict.
-                    # For these cases, key by the pointer of the original tensor object
-                    # (state_dict tensors are detached and therefore no longer shared)
-                    tensor = self.get_parameter(name)
-                    ptrs[id(tensor)].append(name)
+        # Safetensors does not allow tensor aliasing - we're going to remove aliases before saving
+        ptrs = collections.defaultdict(list)
+        for name, tensor in state_dict.items():
+            if not isinstance(tensor, torch.Tensor):
+                # Sometimes in the state_dict we have non-tensor objects.
+                # e.g. in bitsandbytes we have some `str` objects in the state_dict
+                # In the non-tensor case, fall back to the pointer of the object itself
+                ptrs[id(tensor)].append(name)
+
+            elif tensor.device.type == "meta":
+                # In offloaded cases, there may be meta tensors in the state_dict.
+                # For these cases, key by the pointer of the original tensor object
+                # (state_dict tensors are detached and therefore no longer shared)
+                tensor = self.get_parameter(name)
+                ptrs[id(tensor)].append(name)
 
-            else:
-                ptrs[id_tensor_storage(tensor)].append(name)
-
-            shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
-
-            # Recursively descend to find tied weight keys
-            _tied_weights_keys = set(_get_tied_weight_keys(self))
-            error_names = []
-            to_delete_names = set()
-            for names in shared_ptrs.values():
-                # Removing the keys which are declared as known duplicates on
-                # load. This allows to make sure the name which is kept is consistent.
-                if _tied_weights_keys is not None:
-                    found = 0
-                    for name in sorted(names):
-                        matches_pattern = any(re.search(pat, name) for pat in _tied_weights_keys)
-                        if matches_pattern and name in state_dict:
-                            found += 1
-                            if found < len(names):
-                                to_delete_names.add(name)
-            # We are entering a place where the weights and the transformers configuration do NOT match.
-            shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
-            # Those are actually tensor sharing but disjoint from each other, we can safely clone them
-            # Reloaded won't have the same property, but it shouldn't matter in any meaningful way.
-            for name in disjoint_names:
-                state_dict[name] = state_dict[name].clone()
-
-            # When not all duplicates have been cleaned, still remove those keys, but put a clear warning.
-            # If the link between tensors was done at runtime then `from_pretrained` will not get
-            # the key back leading to random tensor. A proper warning will be shown
-            # during reload (if applicable), but since the file is not necessarily compatible with
-            # the config, better show a proper warning.
-            shared_names, identical_names = _find_identical(shared_names, state_dict)
-            # delete tensors that have identical storage
-            for inames in identical_names:
-                known = inames.intersection(to_delete_names)
-                for name in known:
-                    del state_dict[name]
-                unknown = inames.difference(to_delete_names)
-                if len(unknown) > 1:
-                    error_names.append(unknown)
-
-            if shared_names:
-                error_names.extend(shared_names)
-
-            if len(error_names) > 0:
-                raise RuntimeError(
-                    f"The weights trying to be saved contained shared tensors {error_names} which are not properly defined. We found `_tied_weights_keys` to be: {_tied_weights_keys}.\n"
-                    "This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.",
-                )
+            else:
+                ptrs[id_tensor_storage(tensor)].append(name)
+
+        shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
+
+        # Recursively descend to find tied weight keys
+        _tied_weights_keys = set(_get_tied_weight_keys(self))
+        error_names = []
+        to_delete_names = set()
+        for names in shared_ptrs.values():
+            # Removing the keys which are declared as known duplicates on
+            # load. This allows to make sure the name which is kept is consistent.
+            if _tied_weights_keys is not None:
+                found = 0
+                for name in sorted(names):
+                    matches_pattern = any(re.search(pat, name) for pat in _tied_weights_keys)
+                    if matches_pattern and name in state_dict:
+                        found += 1
+                        if found < len(names):
+                            to_delete_names.add(name)
+        # We are entering a place where the weights and the transformers configuration do NOT match.
+        shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict)
+        # Those are actually tensor sharing but disjoint from each other, we can safely clone them
+        # Reloaded won't have the same property, but it shouldn't matter in any meaningful way.
+        for name in disjoint_names:
+            state_dict[name] = state_dict[name].clone()
+
+        # When not all duplicates have been cleaned, still remove those keys, but put a clear warning.
+        # If the link between tensors was done at runtime then `from_pretrained` will not get
+        # the key back leading to random tensor. A proper warning will be shown
+        # during reload (if applicable), but since the file is not necessarily compatible with
+        # the config, better show a proper warning.
+        shared_names, identical_names = _find_identical(shared_names, state_dict)
+        # delete tensors that have identical storage
+        for inames in identical_names:
+            known = inames.intersection(to_delete_names)
+            for name in known:
+                del state_dict[name]
+            unknown = inames.difference(to_delete_names)
+            if len(unknown) > 1:
+                error_names.append(unknown)
+
+        if shared_names:
+            error_names.extend(shared_names)
+
+        if len(error_names) > 0:
+            raise RuntimeError(
+                f"The weights trying to be saved contained shared tensors {error_names} which are not properly defined. We found `_tied_weights_keys` to be: {_tied_weights_keys}.\n"
+                "This can also just mean that the module's tied weight keys are wrong vs the actual tied weights in the model.",
+            )
 
         # Revert all renaming and/or weight operations
         if save_original_format:
             state_dict = revert_weight_conversion(self, state_dict)
 
         # Shard the model if it is too big.
         if not _hf_peft_config_loaded:
-            weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+            weights_name = SAFE_WEIGHTS_NAME
             weights_name = _add_variant(weights_name, variant)
         else:
-            weights_name = ADAPTER_SAFE_WEIGHTS_NAME if safe_serialization else ADAPTER_WEIGHTS_NAME
+            weights_name = ADAPTER_SAFE_WEIGHTS_NAME
 
         filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
         state_dict_split = split_torch_state_dict_into_shards(
@@ -3357,21 +3341,17 @@ def save_pretrained(
             del shard_state_dict
             gc.collect()
 
-            if safe_serialization:
-                # At some point we will need to deal better with save_function (used for TPU and other distributed
-                # joyfulness), but for now this enough. # TODO: we should def parallelize this we are otherwise just waiting
-                # too much before scheduling the next write when its in a different file
-                safe_save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata)
-            else:
-                save_function(shard, os.path.join(save_directory, shard_file))
+            # TODO: we should def parallelize this we are otherwise just waiting
+            # too much before scheduling the next write when its in a different file
+            safe_save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata)
 
         del state_dict
 
         if index is None:
             path_to_weights = os.path.join(save_directory, weights_name)
             logger.info(f"Model weights saved in {path_to_weights}")
         else:
-            save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
+            save_index_file = SAFE_WEIGHTS_INDEX_NAME
             save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant))
             # Save the index as well
             with open(save_index_file, "w", encoding="utf-8") as f:
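Condensed, the rewritten save path always does three things: group tensor names by underlying storage to drop aliases (safetensors forbids tensor aliasing), split the state dict into shards of at most 50GB, and write each shard with safetensors. A standalone sketch of that flow with illustrative helper names; the real code additionally resolves tied weights against `_tied_weights_keys`, handles variants, and writes the sharded index file:

```py
# a standalone sketch of the safetensors-only save path (illustrative names)
import collections
import os

from huggingface_hub import split_torch_state_dict_into_shards
from safetensors.torch import save_file

def sketch_save(state_dict, save_directory, max_shard_size="50GB"):
    os.makedirs(save_directory, exist_ok=True)

    # 1) group tensor names by underlying storage, in the spirit of id_tensor_storage;
    #    safetensors refuses aliased tensors, so duplicates must be dropped first
    ptrs = collections.defaultdict(list)
    for name, tensor in state_dict.items():
        storage = tensor.untyped_storage()
        ptrs[(tensor.device, storage.data_ptr(), storage.nbytes())].append(name)
    for names in ptrs.values():
        for duplicate in sorted(names)[1:]:  # keep one name per shared storage
            del state_dict[duplicate]

    # 2) split into shards no larger than max_shard_size (now 50GB by default)
    split = split_torch_state_dict_into_shards(
        state_dict, filename_pattern="model{suffix}.safetensors", max_shard_size=max_shard_size
    )

    # 3) write every shard with safetensors; the pickle-based .bin path is gone
    for shard_file, tensor_names in split.filename_to_tensors.items():
        shard = {name: state_dict[name].contiguous() for name in tensor_names}
        save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"})
```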
