Merged
47 changes: 38 additions & 9 deletions src/transformers/feature_extraction_utils.py
@@ -67,11 +67,18 @@ class BatchFeature(UserDict):
tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
initialization.
skip_tensor_conversion (`list[str]` or `set[str]`, *optional*):
List or set of keys that should NOT be converted to tensors, even when `tensor_type` is specified.
"""

def __init__(self, data: Optional[dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
def __init__(
self,
data: Optional[dict[str, Any]] = None,
tensor_type: Union[None, str, TensorType] = None,
skip_tensor_conversion: Optional[Union[list[str], set[str]]] = None,
):
super().__init__(data)
self.convert_to_tensors(tensor_type=tensor_type)
self.convert_to_tensors(tensor_type=tensor_type, skip_tensor_conversion=skip_tensor_conversion)

def __getitem__(self, item: str) -> Any:
"""
@@ -110,6 +117,14 @@ def _get_is_as_tensor_fns(self, tensor_type: Optional[Union[str, TensorType]] =
import torch

def as_tensor(value):
if torch.is_tensor(value):
return value

# stack list of tensors if tensor_type is PyTorch (torch.tensor() does not support a list of tensors)
if isinstance(value, (list, tuple)) and len(value) > 0 and torch.is_tensor(value[0]):
return torch.stack(value)

# convert list of numpy arrays to numpy array (stack) if tensor_type is Numpy
Comment on lines +122 to +127
Member:
I dunno if you saw the PR. A community member noticed that VideoMetadata objects throw an error when the return type is 'pt', because they can't be converted to tensors.

I think we can add the fix here by checking whether value is a list/array/etc. and early-exiting otherwise. We won't be able to convert non-list objects anyway.

Member (Author):
Agreed. I was just wondering why we restricted BatchFeature to only contain array/tensor structures in the first place, to make sure we wouldn't break an important assumption by silently allowing other objects in BatchFeature.
Also, these changes should be made along with changes to the ".to()" method, no?

Member:
I think so, and we never had a variety of model inputs in the past. Usually whatever a processor outputs goes directly into forward, so there's a 99% chance it's an array-like object.

IMO we can break the assumption now.

Member (Author):
Ok, sgtm. I might do that in another PR though, as it would be quite a big change, and it might get lost among the 63+ files modified here just due to allowing tensor stacking.

if isinstance(value, (list, tuple)) and len(value) > 0:
if isinstance(value[0], np.ndarray):
value = np.array(value)
@@ -138,14 +153,20 @@ def as_tensor(value, dtype=None):
is_tensor = is_numpy_array
return is_tensor, as_tensor

def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
def convert_to_tensors(
self,
tensor_type: Optional[Union[str, TensorType]] = None,
skip_tensor_conversion: Optional[Union[list[str], set[str]]] = None,
):
"""
Convert the inner content to tensors.

Args:
tensor_type (`str` or [`~utils.TensorType`], *optional*):
The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
`None`, no modification is done.
skip_tensor_conversion (`list[str]` or `set[str]`, *optional*):
List or set of keys that should NOT be converted to tensors, even when `tensor_type` is specified.
"""
if tensor_type is None:
return self
@@ -154,18 +175,26 @@ def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = Non

# Do the tensor conversion in batch
for key, value in self.items():
# Skip keys explicitly marked for no conversion
if skip_tensor_conversion and key in skip_tensor_conversion:
continue

try:
if not is_tensor(value):
tensor = as_tensor(value)

self[key] = tensor
except: # noqa E722
except Exception as e:
if key == "overflowing_values":
raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
raise ValueError(
f"Unable to create tensor for '{key}' with overflowing values of different lengths. "
f"Original error: {str(e)}"
) from e
raise ValueError(
"Unable to create tensor, you should probably activate padding "
"with 'padding=True' to have batched tensors with the same length."
)
f"Unable to convert output '{key}' (type: {type(value).__name__}) to tensor: {str(e)}\n"
f"You can try:\n"
f" 1. Use padding=True to ensure all outputs have the same shape\n"
f" 2. Set return_tensors=None to return Python objects instead of tensors"
) from e

return self

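Taken together, the new BatchFeature behavior can be sketched as follows. This is a minimal illustration against the updated class; the keys and shapes here are made up for the example:

import torch
from transformers import BatchFeature

features = BatchFeature(
    data={
        # a list of same-shape tensors is now stacked into a single tensor
        "pixel_values": [torch.rand(3, 224, 224), torch.rand(3, 224, 224)],
        # keys listed in skip_tensor_conversion are left untouched
        "metadata": [{"fps": 30}, {"fps": 24}],
    },
    tensor_type="pt",
    skip_tensor_conversion=["metadata"],
)
print(features["pixel_values"].shape)  # torch.Size([2, 3, 224, 224])
print(features["metadata"])            # list of dicts, unchanged

If conversion still fails (e.g., ragged lists without padding), the new error message names the offending key and suggests padding=True or return_tensors=None instead of swallowing the original exception.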
1 change: 0 additions & 1 deletion src/transformers/image_processing_utils_fast.py
@@ -932,7 +932,6 @@ def _preprocess(
if do_pad:
processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)

processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

def to_dict(self):
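This removal is the template for most of the model-specific diffs below: since BatchFeature now stacks a list of tensors during convert_to_tensors, the fast processors can pass the raw list straight through. A rough sketch of the simplified tail of _preprocess (the helper name here is hypothetical):

import torch
from transformers import BatchFeature

def _preprocess_tail(processed_images, return_tensors):
    # Previously: torch.stack(processed_images, dim=0) if return_tensors else processed_images
    # Now BatchFeature performs the stacking when return_tensors is set.
    return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

batch = _preprocess_tail([torch.rand(3, 32, 32) for _ in range(2)], return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([2, 3, 32, 32])

With return_tensors=None the list is returned unchanged, matching the old conditional.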
1 change: 0 additions & 1 deletion src/transformers/models/beit/image_processing_beit_fast.py
@@ -163,7 +163,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -251,10 +251,8 @@ def _preprocess(
processed_images, processed_masks = self.pad(
processed_images, return_mask=True, disable_grouping=disable_grouping
)
processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks
data["pixel_mask"] = processed_masks

processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
data["pixel_values"] = processed_images

return BatchFeature(data=data, tensor_type=return_tensors)
@@ -262,7 +262,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(
data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
@@ -162,7 +162,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -171,7 +171,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -207,9 +207,6 @@ def _preprocess(
)
high_res_processed_images_grouped[shape] = stacked_high_res_images
high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
high_res_processed_images = (
torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
)

resized_images_grouped = {}
for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -233,7 +230,6 @@ def _preprocess(
)
processed_images_grouped[shape] = stacked_images
processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(
data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -888,9 +888,6 @@ def _preprocess(
)
high_res_processed_images_grouped[shape] = stacked_high_res_images
high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index)
high_res_processed_images = (
torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images
)

resized_images_grouped = {}
for shape, stacked_high_res_padded_images in high_res_padded_images.items():
@@ -914,7 +911,6 @@ def _preprocess(
)
processed_images_grouped[shape] = stacked_images
processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(
data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images},
@@ -94,7 +94,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -88,7 +88,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -231,7 +231,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

3 changes: 1 addition & 2 deletions src/transformers/models/dpt/image_processing_dpt_fast.py
@@ -225,8 +225,7 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images})
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
"""
3 changes: 1 addition & 2 deletions src/transformers/models/dpt/modular_dpt.py
@@ -228,8 +228,7 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images})
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

def post_process_depth_estimation(
self,
@@ -153,9 +153,8 @@ def _preprocess(
stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs]

# Return in same format as slow processor
image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs

return BatchFeature(data={"pixel_values": image_pairs})
return BatchFeature(data={"pixel_values": stacked_pairs}, tensor_type=return_tensors)

def post_process_keypoint_matching(
self,
@@ -178,7 +178,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

21 changes: 11 additions & 10 deletions src/transformers/models/eomt/image_processing_eomt_fast.py
@@ -162,8 +162,7 @@ def _preprocess_image_like_inputs(
)
ignore_index = kwargs.pop("ignore_index", None)
images_kwargs = kwargs.copy()
processed_images, patch_offsets = self._preprocess(images, **images_kwargs)
outputs = BatchFeature({"pixel_values": processed_images})
outputs = self._preprocess(images, **images_kwargs)

if segmentation_maps is not None:
processed_segmentation_maps = self._prepare_image_like_inputs(
Expand All @@ -183,9 +182,9 @@ def _preprocess_image_like_inputs(
}
)

processed_segmentation_maps, _ = self._preprocess(
processed_segmentation_maps = self._preprocess(
images=processed_segmentation_maps, **segmentation_maps_kwargs
)
).pixel_values
processed_segmentation_maps = processed_segmentation_maps.squeeze(1).to(torch.int64)
# Convert to list of binary masks and labels
mask_labels, class_labels = [], []
@@ -208,8 +207,8 @@ def _preprocess_image_like_inputs(
outputs["mask_labels"] = mask_labels
outputs["class_labels"] = class_labels

if patch_offsets:
outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]
if outputs.patch_offsets:
outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in outputs.patch_offsets]

return outputs

@@ -274,11 +273,13 @@ def _preprocess(
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_images_grouped[shape] = stacked_images
images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = reorder_images(processed_images_grouped, grouped_images_index)

processed_images = torch.stack(images, dim=0) if return_tensors else images

return processed_images, patch_offsets
return BatchFeature(
data={"pixel_values": processed_images, "patch_offsets": patch_offsets},
tensor_type=return_tensors,
skip_tensor_conversion=["patch_offsets"],
)

def merge_image_patches(
self,
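The net effect for EoMT: _preprocess now returns a BatchFeature directly, with patch_offsets excluded from tensor conversion because the offsets are ragged across images. A sketch of the resulting behavior (shapes and offsets are illustrative):

import torch
from transformers import BatchFeature

outputs = BatchFeature(
    data={
        "pixel_values": [torch.rand(3, 640, 640), torch.rand(3, 640, 640)],
        "patch_offsets": [[(0, 0), (0, 320)], [(0, 0)]],  # ragged, not stackable
    },
    tensor_type="pt",
    skip_tensor_conversion=["patch_offsets"],
)
# As in _preprocess_image_like_inputs above, offsets are then tensorized per sample:
outputs["patch_offsets"] = [torch.tensor(o) for o in outputs.patch_offsets]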
2 changes: 0 additions & 2 deletions src/transformers/models/flava/image_processing_flava_fast.py
@@ -306,7 +306,6 @@ def _preprocess_image(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return processed_images

@@ -397,7 +396,6 @@ def _preprocess(
mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
)
masks = [mask_generator() for _ in range(len(images))]
masks = torch.stack(masks, dim=0) if return_tensors else masks
data["bool_masked_pos"] = masks

return BatchFeature(data=data, tensor_type=return_tensors)
2 changes: 1 addition & 1 deletion src/transformers/models/fuyu/image_processing_fuyu.py
@@ -94,7 +94,7 @@ class FuyuBatchFeature(BatchFeature):
The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
"""

def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None, **kwargs):
"""
Convert the inner content to tensors.
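The added **kwargs keeps this override call-compatible with the base class, whose __init__ now forwards skip_tensor_conversion to convert_to_tensors. A minimal sketch of the requirement (the subclass here is hypothetical):

from typing import Optional, Union
from transformers import BatchFeature, TensorType

class MyBatchFeature(BatchFeature):
    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None, **kwargs):
        # Without **kwargs, the base __init__ call
        # convert_to_tensors(tensor_type=..., skip_tensor_conversion=...) would raise a TypeError.
        return super().convert_to_tensors(tensor_type=tensor_type, **kwargs)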
@@ -231,7 +231,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(
data={"pixel_values": processed_images, "num_crops": num_crops}, tensor_type=return_tensors
)
1 change: 0 additions & 1 deletion src/transformers/models/glpn/image_processing_glpn_fast.py
@@ -107,7 +107,6 @@ def _preprocess(
processed_groups[shape] = stacked_images

processed_images = reorder_images(processed_groups, grouped_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

def post_process_depth_estimation(self, outputs, target_sizes=None):
@@ -189,7 +189,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(
data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
@@ -164,12 +164,8 @@ def _preprocess(

input_ids = reorder_images(input_ids_grouped, grouped_images_index)

return BatchFeature(
data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
tensor_type=return_tensors,
)
return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors)

pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

def to_dict(self):
@@ -84,7 +84,6 @@ def _preprocess(
processed_videos_grouped[shape] = stacked_videos

processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

return BatchFeature(data={"pixel_values": processed_videos}, tensor_type=return_tensors)

@@ -140,7 +140,6 @@ def _preprocess(
processed_videos_grouped[shape] = stacked_videos

processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)

@@ -180,7 +180,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

@@ -264,8 +264,8 @@ def _preprocess(

encoded_outputs = BatchFeature(
data={
"flattened_patches": torch.stack(flattened_patches, dim=0) if return_tensors else flattened_patches,
"attention_mask": torch.stack(attention_masks, dim=0) if return_tensors else attention_masks,
"flattened_patches": flattened_patches,
"attention_mask": attention_masks,
"width": width,
"height": height,
"rows": rows,
@@ -101,7 +101,6 @@ def _preprocess(
processed_images_grouped[shape] = stacked_images

processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

data = BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
