"""Camera-related TVTensor types."""
from typing import Any, Self, override
import torch
from torch import Tensor
from torch.utils._pytree import tree_flatten
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as _F
from torchvision.transforms.v2.functional import (
register_kernel as _register_kernel,
)
from torchvision.transforms.v2.functional._geometry import (
_center_crop_parse_output_size,
_compute_resized_output_size,
_parse_pad_padding,
)
from torchvision.tv_tensors import TVTensor
# NOTE: stray "[docs]" Sphinx viewcode-link artifact from HTML extraction removed here (not part of the module source).
class CameraImages(tv_tensors.Image):
    """:class:`~torchvision.tv_tensors.Image` subclass for multi-camera images with shape ``[N, C, H, W]``.

    Inherits from :class:`torchvision.tv_tensors.Image` so every
    torchvision v2 image transform dispatches automatically.
    For 3D spatial transforms (flip, rotate, etc.) this type passes through
    unchanged.

    Args:
        data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
        dtype (torch.dtype, optional): Desired data type.
        device (torch.device, optional): Desired device.
        requires_grad (bool, optional): Whether autograd should record operations.
    """

    # Typing-only stub: lets pyrefly see this constructor signature rather
    # than falling back to torch.Tensor.__init__.
    def __init__(
        self,
        data: Any,
        *,
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> None: ...

    def __new__(
        cls,
        data: Any,
        *,
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> Self:
        # Normalize arbitrary input to a tensor first, then enforce rank.
        tensor = cls._to_tensor(
            data, dtype=dtype, device=device, requires_grad=requires_grad
        )
        if tensor.ndim >= 3:
            return tensor.as_subclass(cls)
        raise ValueError(
            f"Expected at least 3D tensor [N, C, H, W], got {tensor.ndim}D"
        )

    @override
    def __repr__(self, *, tensor_contents: Any = None) -> str:
        # Compact TVTensor-style repr instead of the full tensor dump.
        return self._make_repr()
# NOTE: stray "[docs]" Sphinx viewcode-link artifact from HTML extraction removed here (not part of the module source).
class CameraExtrinsics(TVTensor):
    """:class:`~torch.Tensor` subclass for camera extrinsic matrices with shape ``[N, 4, 4]``.

    Each matrix transforms a point from the dataset's *source frame*
    to camera frame. The source frame is dataset-defined: lidar for
    lidar-equipped datasets (e.g. KITTI, nuScenes), ego/world for
    camera-only datasets.

    3D spatial transforms (flip, rotate, etc.) update these matrices
    to keep the source-to-camera mapping consistent after the source
    frame changes.

    Args:
        data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
        dtype (torch.dtype, optional): Desired data type.
        device (torch.device, optional): Desired device.
        requires_grad (bool, optional): Whether autograd should record operations.
    """

    # Typing-only stub: lets pyrefly see this constructor signature rather
    # than falling back to torch.Tensor.__init__.
    def __init__(
        self,
        data: Any,
        *,
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> None: ...

    def __new__(
        cls,
        data: Any,
        *,
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> Self:
        tensor = cls._to_tensor(
            data, dtype=dtype, device=device, requires_grad=requires_grad
        )
        # Shape is checked before dtype so callers get the most specific error.
        has_4x4_shape = tensor.ndim >= 2 and tuple(tensor.shape[-2:]) == (4, 4)
        if not has_4x4_shape:
            raise ValueError(
                f"Expected tensor with shape [..., 4, 4], got {tuple(tensor.shape)}"
            )
        if not torch.is_floating_point(tensor):
            raise ValueError(
                f"Extrinsics require floating point tensors, got {tensor.dtype}."
            )
        return tensor.as_subclass(cls)

    @override
    def __repr__(self, *, tensor_contents: Any = None) -> str:
        # Compact TVTensor-style repr instead of the full tensor dump.
        return self._make_repr()
# NOTE: stray "[docs]" Sphinx viewcode-link artifact from HTML extraction removed here (not part of the module source).
class CameraIntrinsics(TVTensor):
    """:class:`~torch.Tensor` subclass for camera intrinsic matrices with shape ``[N, 3, 3]``.

    Each matrix maps from camera-frame 3D coordinates to pixel coordinates.
    Image-space transforms (resize, crop, etc.) update these matrices.
    3D spatial transforms pass them through unchanged.

    Args:
        data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
        image_size (tuple): Height and width of the corresponding images as
            ``(h, w)``. Required for geometric transforms (resize) that need
            to compute scale factors.
        dtype (torch.dtype, optional): Desired data type.
        device (torch.device, optional): Desired device.
        requires_grad (bool, optional): Whether autograd should record operations.
    """

    # (h, w) of the images these intrinsics correspond to; carried as
    # instance metadata and propagated through torch ops by _wrap_output.
    image_size: tuple[int, int]

    # This stub exists so pyrefly sees the correct constructor signature instead of torch.Tensor.__init__.
    def __init__(
        self,
        data: Any,
        *,
        image_size: tuple[int, int],
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> None: ...

    @classmethod
    def _wrap(
        cls,
        tensor: Tensor,
        *,
        image_size: tuple[int, int],
    ) -> Self:
        """Re-type ``tensor`` as ``cls`` and attach validated ``image_size`` metadata.

        Raises:
            ValueError: If ``image_size`` is not a tuple of two positive ints.
        """
        if (
            not isinstance(image_size, tuple)
            or len(image_size) != 2
            or not all(isinstance(d, int) and d > 0 for d in image_size)
        ):
            raise ValueError(
                f"image_size must be a tuple of two positive ints (h, w), "
                f"got {image_size!r}"
            )
        intrinsics = tensor.as_subclass(cls)
        intrinsics.image_size = image_size
        return intrinsics

    def __new__(
        cls,
        data: Any,
        *,
        image_size: tuple[int, int],
        dtype: torch.dtype | None = None,
        device: torch.device | str | int | None = None,
        requires_grad: bool | None = None,
    ) -> Self:
        tensor = cls._to_tensor(
            data, dtype=dtype, device=device, requires_grad=requires_grad
        )
        if tensor.ndim < 2 or tensor.shape[-2:] != (3, 3):
            raise ValueError(
                f"Expected tensor with shape [..., 3, 3], got {tuple(tensor.shape)}"
            )
        if not torch.is_floating_point(tensor):
            raise ValueError(
                f"Intrinsics require floating point tensors, got {tensor.dtype}."
            )
        return cls._wrap(tensor, image_size=image_size)

    @classmethod
    @override
    def _wrap_output(
        cls,
        output: Tensor,
        args: Any = (),
        kwargs: Any = None,
    ) -> Self:
        """Re-wrap the plain-Tensor output of a torch op as ``CameraIntrinsics``.

        The ``image_size`` metadata is taken from the first ``CameraIntrinsics``
        found among the op's inputs (raises ``StopIteration`` if none is
        present, matching torchvision's own TVTensor subclasses).
        """
        flat_params, _ = tree_flatten(
            tuple(args) + (tuple(kwargs.values()) if kwargs else ())
        )
        first = next(x for x in flat_params if isinstance(x, CameraIntrinsics))
        if isinstance(output, Tensor) and not isinstance(output, CameraIntrinsics):
            output = cls._wrap(output, image_size=first.image_size)
        elif isinstance(output, (tuple, list)):
            # Ops such as torch.chunk/unbind return containers of plain
            # Tensors; re-wrap each element so the subclass and metadata are
            # not silently lost (mirrors BoundingBoxes._wrap_output).
            output = type(output)(
                cls._wrap(part, image_size=first.image_size) for part in output
            )
        return output

    @override
    def __repr__(self, *, tensor_contents: Any = None) -> str:
        # Include the image_size metadata in the compact repr.
        return self._make_repr(image_size=self.image_size)
@_register_kernel(_F.resize, CameraIntrinsics)
def _resize_intrinsics(
    inpt: CameraIntrinsics,
    size: list[int] | None,
    max_size: int | None = None,
    **kwargs: Any,
) -> CameraIntrinsics:
    """Scale the intrinsics to follow an image resize.

    A resize by (sh, sw) maps pixel (u, v) to (sw*u, sh*v), so the whole
    first row (fx, skew, cx) scales by sw and the second row (fy, cy) by sh.
    """
    prev_h, prev_w = inpt.image_size
    out_h, out_w = _compute_resized_output_size(
        (prev_h, prev_w), size=size, max_size=max_size
    )
    matrix = inpt.as_subclass(Tensor).clone()
    matrix[..., 0, :] = matrix[..., 0, :] * (out_w / prev_w)
    matrix[..., 1, :] = matrix[..., 1, :] * (out_h / prev_h)
    return CameraIntrinsics._wrap(matrix, image_size=(out_h, out_w))
@_register_kernel(_F.crop, CameraIntrinsics)
def _crop_intrinsics(
    inpt: CameraIntrinsics, top: int, left: int, height: int, width: int
) -> CameraIntrinsics:
    """Shift the principal point to follow a crop anchored at (top, left)."""
    matrix = inpt.as_subclass(Tensor).clone()
    # Pixel (left, top) becomes the new origin, so cx/cy translate by it.
    matrix[..., 0, 2] = matrix[..., 0, 2] - left
    matrix[..., 1, 2] = matrix[..., 1, 2] - top
    return CameraIntrinsics._wrap(matrix, image_size=(height, width))
@_register_kernel(_F.center_crop, CameraIntrinsics)
def _center_crop_intrinsics(
    inpt: CameraIntrinsics, output_size: list[int]
) -> CameraIntrinsics:
    """Shift the principal point to follow a center crop.

    Mirrors torchvision's image kernel exactly: when the target is larger
    than the image, torchvision pads first (``//2`` on left/top,
    ``(d + 1)//2`` on right/bottom); the crop anchor is then
    ``int(round((size - crop)/2.0))``, not floor division — using ``// 2``
    here would put cx/cy off by one pixel for odd differences.
    """
    crop_h, crop_w = _center_crop_parse_output_size(output_size)
    h, w = inpt.image_size
    K = inpt.as_subclass(Tensor).clone()
    if crop_h > h or crop_w > w:
        # Implicit padding step of torchvision's center_crop.
        pad_left = (crop_w - w) // 2 if crop_w > w else 0
        pad_top = (crop_h - h) // 2 if crop_h > h else 0
        pad_right = (crop_w - w + 1) // 2 if crop_w > w else 0
        pad_bottom = (crop_h - h + 1) // 2 if crop_h > h else 0
        K[..., 0, 2] += pad_left  # cx
        K[..., 1, 2] += pad_top  # cy
        h += pad_top + pad_bottom
        w += pad_left + pad_right
    # Crop anchor on the (possibly padded) image, same rounding as
    # torchvision's _center_crop_compute_crop_anchor.
    top = int(round((h - crop_h) / 2.0))
    left = int(round((w - crop_w) / 2.0))
    K[..., 0, 2] -= left  # cx
    K[..., 1, 2] -= top  # cy
    return CameraIntrinsics._wrap(K, image_size=(crop_h, crop_w))
@_register_kernel(_F.pad, CameraIntrinsics)
def _pad_intrinsics(
    inpt: CameraIntrinsics, padding: int | list[int], **kwargs: Any
) -> CameraIntrinsics:
    """Shift the principal point and grow image_size to follow padding."""
    # _parse_pad_padding normalizes to (left, right, top, bottom).
    pad_left, pad_right, pad_top, pad_bottom = _parse_pad_padding(padding)
    matrix = inpt.as_subclass(Tensor).clone()
    # Padding moves the image origin up/left, so cx/cy grow by the offsets.
    matrix[..., 0, 2] = matrix[..., 0, 2] + pad_left
    matrix[..., 1, 2] = matrix[..., 1, 2] + pad_top
    h, w = inpt.image_size
    return CameraIntrinsics._wrap(
        matrix,
        image_size=(h + pad_top + pad_bottom, w + pad_left + pad_right),
    )
@_register_kernel(_F.resized_crop, CameraIntrinsics)
def _resized_crop_intrinsics(
    inpt: CameraIntrinsics,
    top: int,
    left: int,
    height: int,
    width: int,
    size: list[int],
    **kwargs: Any,
) -> CameraIntrinsics:
    """Compose the crop and resize kernels, matching resized_crop on images."""
    return _resize_intrinsics(
        _crop_intrinsics(inpt, top=top, left=left, height=height, width=width),
        size=size,
    )