diff --git a/fvdb/__init__.py b/fvdb/__init__.py index 7c0c20ae..e254a8ce 100644 --- a/fvdb/__init__.py +++ b/fvdb/__init__.py @@ -126,7 +126,7 @@ def gaussian_render_jagged( from .convolution_plan import ConvolutionPlan from .gaussian_splatting import GaussianSplat3d, ProjectedGaussianSplats -from .enums import ProjectionType, ShOrderingMode +from .enums import CameraModel, ProjectionType, RollingShutterType, ShOrderingMode # Import torch-compatible functions that work with both Tensor and JaggedTensor from .torch_jagged import ( @@ -190,6 +190,8 @@ def gaussian_render_jagged( "JaggedTensor", "GaussianSplat3d", "ProjectedGaussianSplats", + "CameraModel", + "RollingShutterType", "ProjectionType", "ShOrderingMode", "ConvolutionPlan", diff --git a/fvdb/__init__.pyi b/fvdb/__init__.pyi index 9dbb4f43..a63af0b5 100644 --- a/fvdb/__init__.pyi +++ b/fvdb/__init__.pyi @@ -19,7 +19,7 @@ def _parse_device_string(device_string: str | torch.device) -> torch.device: ... from . import nn, utils, viz from ._fvdb_cpp import config, hilbert, morton, volume_render from .convolution_plan import ConvolutionPlan -from .enums import ProjectionType, ShOrderingMode +from .enums import CameraModel, ProjectionType, RollingShutterType, ShOrderingMode from .gaussian_splatting import GaussianSplat3d, ProjectedGaussianSplats from .grid import Grid from .grid_batch import GridBatch, gcat @@ -109,6 +109,8 @@ __all__ = [ "GaussianSplat3d", "ProjectedGaussianSplats", "ConvolutionPlan", + "CameraModel", + "RollingShutterType", "ProjectionType", "ShOrderingMode", "Grid", diff --git a/fvdb/_fvdb_cpp.pyi b/fvdb/_fvdb_cpp.pyi index 5b7c9546..8180736a 100644 --- a/fvdb/_fvdb_cpp.pyi +++ b/fvdb/_fvdb_cpp.pyi @@ -196,6 +196,24 @@ class GaussianSplat3d: antialias: bool = ..., backgrounds: Optional[torch.Tensor] = ..., ) -> tuple[torch.Tensor, torch.Tensor]: ... 
+ def render_images_from_world( + self, + world_to_camera_matrices: torch.Tensor, + projection_matrices: torch.Tensor, + image_width: int, + image_height: int, + near: float, + far: float, + camera_model: "CameraModel" = ..., + distortion_coeffs: Optional[torch.Tensor] = ..., + sh_degree_to_use: int = ..., + tile_size: int = ..., + min_radius_2d: float = ..., + eps_2d: float = ..., + antialias: bool = ..., + backgrounds: Optional[torch.Tensor] = ..., + masks: Optional[torch.Tensor] = ..., + ) -> tuple[torch.Tensor, torch.Tensor]: ... def sparse_render_images( self, pixels_to_render: JaggedTensor, @@ -1178,3 +1196,16 @@ def volume_render( packInfo: torch.Tensor, transmittanceThresh: float, ) -> list[torch.Tensor]: ... + +class RollingShutterType(Enum): + NONE = ... + VERTICAL = ... + HORIZONTAL = ... + +class CameraModel(Enum): + PINHOLE = ... + OPENCV_RADTAN_5 = ... + OPENCV_RATIONAL_8 = ... + OPENCV_RADTAN_THIN_PRISM_9 = ... + OPENCV_THIN_PRISM_12 = ... + ORTHOGRAPHIC = ... diff --git a/fvdb/enums.py b/fvdb/enums.py index c81c3658..9d7b4cce 100644 --- a/fvdb/enums.py +++ b/fvdb/enums.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from enum import Enum +from enum import Enum, IntEnum class ProjectionType(str, Enum): @@ -48,3 +48,73 @@ class ShOrderingMode(str, Enum): The feature channels of spherical harmonics are stored in separate blocks for each coefficient. *i.e.* The spherical harmonics tensor corresponds to a (row-major) contiguous tensor of shape ``[num_coefficients, channels, num_sh_bases]``, where channels=3 for RGB. """ + + +class RollingShutterType(IntEnum): + """ + Rolling shutter policy for camera projection / ray generation. + + Rolling shutter models treat different image rows/columns as having different exposure times. + FVDB uses this to interpolate between per-camera start/end poses when generating rays. + """ + + NONE = 0 + """ + No rolling shutter: the start pose is used for all pixels. 
+ """ + + VERTICAL = 1 + """ + Vertical rolling shutter: exposure time varies with image row (y). + """ + + HORIZONTAL = 2 + """ + Horizontal rolling shutter: exposure time varies with image column (x). + """ + + +class CameraModel(IntEnum): + """ + Camera model for projection / ray generation. + + Notes: + - ``PINHOLE`` and ``ORTHOGRAPHIC`` ignore distortion coefficients. + - ``OPENCV_*`` variants use pinhole intrinsics plus OpenCV-style distortion. When distortion + coefficients are provided, FVDB expects a packed layout: + + ``[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]`` + + Unused coefficients for a given model should be set to 0. + """ + + PINHOLE = 0 + """ + Ideal pinhole camera model (no distortion). + """ + + OPENCV_RADTAN_5 = 1 + """ + OpenCV radial-tangential distortion with 5 parameters (k1,k2,p1,p2,k3). + """ + + OPENCV_RATIONAL_8 = 2 + """ + OpenCV rational radial-tangential distortion with 8 parameters (k1..k6,p1,p2). + """ + + OPENCV_RADTAN_THIN_PRISM_9 = 3 + """ + OpenCV radial-tangential + thin-prism distortion with 9 parameters (k1,k2,p1,p2,k3,s1..s4). + """ + + OPENCV_THIN_PRISM_12 = 4 + """ + OpenCV rational radial-tangential + thin-prism distortion with 12 parameters + (k1..k6,p1,p2,s1..s4). + """ + + ORTHOGRAPHIC = 5 + """ + Orthographic camera model (no distortion). + """ diff --git a/fvdb/gaussian_splatting.py b/fvdb/gaussian_splatting.py index de6d5201..0a8c10a8 100644 --- a/fvdb/gaussian_splatting.py +++ b/fvdb/gaussian_splatting.py @@ -5,8 +5,9 @@ from typing import Any, Mapping, Sequence, TypeVar, overload import torch -from fvdb.enums import ProjectionType +from fvdb.enums import CameraModel, ProjectionType +from . 
import _fvdb_cpp as _C from ._fvdb_cpp import GaussianSplat3d as GaussianSplat3dCpp from ._fvdb_cpp import JaggedTensor as JaggedTensorCpp from ._fvdb_cpp import ProjectedGaussianSplats as ProjectedGaussianSplatsCpp @@ -1877,6 +1878,117 @@ def render_images( backgrounds=backgrounds, ) + def render_images_from_world( + self, + world_to_camera_matrices: torch.Tensor, + projection_matrices: torch.Tensor, + image_width: int, + image_height: int, + near: float, + far: float, + camera_model: CameraModel = CameraModel.PINHOLE, + distortion_coeffs: torch.Tensor | None = None, + sh_degree_to_use: int = -1, + tile_size: int = 16, + min_radius_2d: float = 0.0, + eps_2d: float = 0.3, + antialias: bool = False, + backgrounds: torch.Tensor | None = None, + masks: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Render dense images by rasterizing directly from world-space 3D Gaussians. + + This is similar to :meth:`render_images`, but the rasterization step is performed in 3D + using per-pixel rays against the Gaussian ellipsoids (instead of rasterizing 2D conics + produced by a projection step). This enables gradients w.r.t. Gaussian geometry + (``means``, ``quats``, ``log_scales``) through rasterization, which is useful for + Unscented Transform (UT)-based OpenCV camera models. + + Notes: + - This is **dense-only**: outputs are dense tensors of shape ``(C, H, W, ...)``. + - Tile intersection data is still computed from a (non-differentiable) projection + step, so gradients can be discontinuous when small parameter changes cause a Gaussian + to enter/leave a tile (or switch which tiles it overlaps). + - Background compositing follows standard "over" alpha compositing. If + ``backgrounds`` is provided, the output color is: + + ``color = sum_i (feat_i * alpha_i * T_i) + T_final * background`` + + where ``T_final`` is the remaining transmittance at the end of rasterization, and + ``alpha = 1 - T_final``. 
+ - ``masks`` is a **per-tile** boolean mask (parity with the classic rasterizer). + Tiles where ``masks[c, th, tw] == False`` are skipped entirely: the output is + background with ``alpha=0`` and the tile contributes **zero gradients**. + + Example: + + .. code-block:: python + + images, alphas = gaussian_splat_3d.render_images_from_world( + world_to_camera_matrices, # [C,4,4] + projection_matrices, # [C,3,3] + image_width=640, + image_height=480, + near=0.01, + far=1e10, + camera_model=fvdb.CameraModel.OPENCV_RATIONAL_8, + distortion_coeffs=dist_coeffs, # [C,12] + backgrounds=bg, # [C,D] + masks=tile_mask, # [C,tileH,tileW] (optional) + ) + + Args: + world_to_camera_matrices (torch.Tensor): Tensor of shape ``(C, 4, 4)``. + projection_matrices (torch.Tensor): Tensor of shape ``(C, 3, 3)``. + image_width (int): Output image width ``W``. + image_height (int): Output image height ``H``. + near (float): Near clipping plane. + far (float): Far clipping plane. + camera_model (CameraModel): Camera model used for ray generation and distortion. + distortion_coeffs (torch.Tensor | None): Distortion coefficients for OpenCV camera + models. Use ``None`` for no distortion. Expected shape is ``(C, 12)`` with packed + layout ``[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]``. For camera models that use fewer + coefficients, unused entries should be set to 0. + sh_degree_to_use (int): SH degree to use. ``-1`` means use all available SH bases. + tile_size (int): Tile size (in pixels). ``tileH = ceil(H / tile_size)``, + ``tileW = ceil(W / tile_size)``. + min_radius_2d (float): Minimum projected radius (in pixels) used for tiling/culling. + eps_2d (float): Padding used during tiling/projection to avoid numerical issues. + antialias (bool): If ``True``, applies opacity correction (when available) when using + ``eps_2d > 0.0``. + backgrounds (torch.Tensor | None): Optional background colors of shape ``(C, D)``, + where ``D`` is :attr:`num_channels`. If ``None``, background is treated as 0. 
+ masks (torch.Tensor | None): Optional per-tile boolean mask of shape + ``(C, tileH, tileW)``. Masked tiles are skipped and filled with background. + + Returns: + images (torch.Tensor): Rendered images of shape ``(C, H, W, D)``. + alpha_images (torch.Tensor): Alpha images of shape ``(C, H, W, 1)``. + """ + if isinstance(camera_model, CameraModel): + camera_model_cpp = getattr(_C.CameraModel, camera_model.name) + else: + camera_model_cpp = camera_model + + return self._impl.render_images_from_world( + world_to_camera_matrices=world_to_camera_matrices, + projection_matrices=projection_matrices, + image_width=image_width, + image_height=image_height, + near=near, + far=far, + camera_model=camera_model_cpp, + distortion_coeffs=distortion_coeffs, + sh_degree_to_use=sh_degree_to_use, + tile_size=tile_size, + min_radius_2d=min_radius_2d, + eps_2d=eps_2d, + antialias=antialias, + backgrounds=backgrounds, + masks=masks, + ) + def sparse_render_images( self, pixels_to_render: JaggedTensorOrTensorT, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8fe92fc9..4cde6456 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,12 +10,10 @@ option(FVDB_BUILD_TESTS "Configure CMake to build tests" ON) option(FVDB_BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(FVDB_STRIP_SYMBOLS "Strip symbols from the build" OFF) option(FVDB_LINEINFO "Enable lineinfo in the build" OFF) -option(FVDB_USE_OPENMP "Enable OpenMP for CPU parallelization" ON) message(STATUS "FVDB: Configure CMake to build tests: ${FVDB_BUILD_TESTS}") message(STATUS "FVDB: Configure CMake to build (google & nvbench) benchmarks: ${FVDB_BUILD_BENCHMARKS}") message(STATUS "FVDB_STRIP_SYMBOLS: ${FVDB_STRIP_SYMBOLS}") -message(STATUS "FVDB_USE_OPENMP: ${FVDB_USE_OPENMP}") # Get dependencies include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/get_cpm.cmake) @@ -51,6 +49,7 @@ set(FVDB_CPP_FILES fvdb/detail/autograd/Inject.cpp fvdb/detail/autograd/GaussianProjection.cpp 
fvdb/detail/autograd/GaussianRasterize.cpp + fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp + fvdb/detail/autograd/GaussianRasterizeSparse.cpp fvdb/detail/autograd/JaggedReduce.cpp fvdb/detail/autograd/MaxPoolGrid.cpp @@ -112,6 +111,8 @@ set(FVDB_CU_FILES fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu fvdb/detail/ops/gsplat/GaussianRasterizeBackward.cu fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu + fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu + fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu fvdb/detail/ops/gsplat/GaussianRasterizeNumContributingGaussians.cu fvdb/detail/ops/gsplat/GaussianRasterizeTopContributingGaussianIds.cu fvdb/detail/ops/gsplat/GaussianRasterizeContributingGaussianIds.cu diff --git a/src/fvdb/GaussianSplat3d.cpp b/src/fvdb/GaussianSplat3d.cpp index bc485b95..d2a1b9ad 100644 --- a/src/fvdb/GaussianSplat3d.cpp +++ b/src/fvdb/GaussianSplat3d.cpp @@ -12,8 +12,10 @@ #include #include #include +#include // Ops headers +#include #include #include #include @@ -343,12 +345,13 @@ GaussianSplat3d::projectGaussiansImpl(const torch::Tensor &worldToCameraMatrices ret.perGaussian2dMean = projectionResults[1]; ret.perGaussianDepth = projectionResults[2]; ret.perGaussianConic = projectionResults[3]; + // FIXME: Use accessors in the kernel and use expand + ret.perGaussianOpacity = opacities().repeat({C, 1}); if (settings.antialias) { - // Antialias compensation is per-camera [C, N], so we must materialize [C, N] - ret.perGaussianOpacity = opacities().unsqueeze(0).expand({C, -1}) * projectionResults[4]; - } else { - // Store as [N]; the opacities() accessor will lazily expand to [C, N] when accessed - ret.perGaussianOpacity = opacities(); + ret.perGaussianOpacity *= projectionResults[4]; + // FIXME (Francis): The contiguity requirement is dumb and should be + // removed by using accessors in the kernel + ret.perGaussianOpacity = ret.perGaussianOpacity.contiguous(); } ret.perGaussianRenderQuantity = [&]() { @@
-476,12 +479,13 @@ GaussianSplat3d::sparseProjectGaussiansImpl(const JaggedTensor &pixelsToRender, ret.perGaussian2dMean = projectionResults[1]; ret.perGaussianDepth = projectionResults[2]; ret.perGaussianConic = projectionResults[3]; + // FIXME: Use accessors in the kernel and use expand + ret.perGaussianOpacity = opacities().repeat({C, 1}); if (settings.antialias) { - // Antialias compensation is per-camera [C, N], so we must materialize [C, N] - ret.perGaussianOpacity = opacities().unsqueeze(0).expand({C, -1}) * projectionResults[4]; - } else { - // Store as [N]; the opacities() accessor will lazily expand to [C, N] when accessed - ret.perGaussianOpacity = opacities(); + ret.perGaussianOpacity *= projectionResults[4]; + // FIXME (Francis): The contiguity requirement is dumb and should be + // removed by using accessors in the kernel + ret.perGaussianOpacity = ret.perGaussianOpacity.contiguous(); } ret.perGaussianRenderQuantity = [&]() { @@ -555,7 +559,7 @@ GaussianSplat3d::renderCropFromProjectedGaussiansImpl( projectedGaussians.perGaussian2dMean, projectedGaussians.perGaussianConic, projectedGaussians.perGaussianRenderQuantity, - projectedGaussians.opacities(), + projectedGaussians.perGaussianOpacity, cropWidth_, cropHeight_, cropOriginW_, @@ -597,7 +601,7 @@ GaussianSplat3d::sparseRenderImpl(const JaggedTensor &pixelsToRender, state.perGaussian2dMean, state.perGaussianConic, state.perGaussianRenderQuantity, - state.opacities(), + state.perGaussianOpacity, settings.imageWidth, settings.imageHeight, 0, @@ -629,7 +633,7 @@ GaussianSplat3d::renderNumContributingGaussiansImpl( return fvdb::detail::ops::dispatchGaussianRasterizeNumContributingGaussians( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, settings); @@ -651,7 +655,7 @@ GaussianSplat3d::sparseRenderNumContributingGaussiansImpl( return fvdb::detail::ops::dispatchGaussianSparseRasterizeNumContributingGaussians< 
DeviceTag>(state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, pixelsToRender, @@ -693,7 +697,7 @@ GaussianSplat3d::renderContributingGaussianIdsImpl( return fvdb::detail::ops::dispatchGaussianRasterizeContributingGaussianIds( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, settings, @@ -717,7 +721,7 @@ GaussianSplat3d::sparseRenderContributingGaussianIdsImpl( return fvdb::detail::ops::dispatchGaussianSparseRasterizeContributingGaussianIds( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, pixelsToRender, @@ -884,6 +888,154 @@ GaussianSplat3d::renderImages(const torch::Tensor &worldToCameraMatrices, state, settings.tileSize, settings.imageWidth, settings.imageHeight, 0, 0, backgrounds); } +std::tuple +GaussianSplat3d::renderImagesFromWorld(const torch::Tensor &worldToCameraMatrices, + const torch::Tensor &projectionMatrices, + const size_t imageWidth, + const size_t imageHeight, + const float near, + const float far, + const fvdb::detail::ops::CameraModel cameraModel, + const std::optional &distortionCoeffs, + const int64_t shDegreeToUse, + const size_t tileSize, + const float minRadius2d, + const float eps2d, + const bool antialias, + const std::optional &backgrounds, + const std::optional &masks) { + FVDB_FUNC_RANGE(); + const int C = worldToCameraMatrices.size(0); // number of cameras + TORCH_CHECK(C > 0, "At least one camera must be provided (got 0)"); + + torch::Tensor perGaussianRadius; + torch::Tensor perGaussian2dMean; + torch::Tensor perGaussianDepth; + torch::Tensor perGaussianOpacity; + torch::Tensor perGaussianRenderQuantity; + torch::Tensor tileOffsets; + torch::Tensor tileGaussianIds; + + if (cameraModel == fvdb::detail::ops::CameraModel::PINHOLE || + cameraModel == 
fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) { + // Fast path: reuse the classic projection for tiling/sorting. + RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.nearPlane = near; + settings.farPlane = far; + settings.projectionType = (cameraModel == fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) + ? fvdb::detail::ops::ProjectionType::ORTHOGRAPHIC + : fvdb::detail::ops::ProjectionType::PERSPECTIVE; + settings.shDegreeToUse = shDegreeToUse; + settings.radiusClip = minRadius2d; + settings.eps2d = eps2d; + settings.antialias = antialias; + settings.tileSize = tileSize; + settings.renderMode = RenderSettings::RenderMode::RGB; + + const ProjectedGaussianSplats state = + projectGaussiansImpl(worldToCameraMatrices, projectionMatrices, settings); + + perGaussianRadius = state.perGaussianRadius; + perGaussian2dMean = state.perGaussian2dMean; + perGaussianDepth = state.perGaussianDepth; + perGaussianOpacity = state.perGaussianOpacity; + perGaussianRenderQuantity = state.perGaussianRenderQuantity; + tileOffsets = state.tileOffsets; + tileGaussianIds = state.tileGaussianIds; + } else { + // OpenCV camera models: use UT projection to compute radii/depths for tiling/sorting. + // Rasterization itself is still performed with the 3DGS ray-ellipsoid kernel. + const torch::Tensor distortionCoeffs_ = distortionCoeffs.has_value() + ? 
distortionCoeffs.value() + : torch::empty({C, 0}, mMeans.options()); + + fvdb::detail::ops::UTParams utParams = fvdb::detail::ops::UTParams{}; + const auto projectionResults = FVDB_DISPATCH_KERNEL(mMeans.device(), [&]() { + return fvdb::detail::ops::dispatchGaussianProjectionForwardUT( + mMeans, + mQuats, + mLogScales, + worldToCameraMatrices, + worldToCameraMatrices, + projectionMatrices, + fvdb::detail::ops::RollingShutterType::NONE, + utParams, + cameraModel, + distortionCoeffs_, + static_cast(imageWidth), + static_cast(imageHeight), + eps2d, + near, + far, + minRadius2d, + antialias); + }); + + perGaussianRadius = std::get<0>(projectionResults); + perGaussian2dMean = std::get<1>(projectionResults); + perGaussianDepth = std::get<2>(projectionResults); + + // Opacities are stored per-Gaussian (shared across cameras). Expand to [C,N] and apply + // antialiasing compensation (if computed). + perGaussianOpacity = opacities().repeat({C, 1}); + if (antialias) { + const torch::Tensor compensations = std::get<4>(projectionResults); + TORCH_CHECK( + compensations.defined(), + "UT projection returned an undefined compensation tensor in antialias mode"); + perGaussianOpacity *= compensations; + perGaussianOpacity = perGaussianOpacity.contiguous(); + } + + // Evaluate SH for per-camera features. (We use the start pose for view-direction shading.) + perGaussianRenderQuantity = + evalSphericalHarmonicsImpl(shDegreeToUse, worldToCameraMatrices, perGaussianRadius); + + const int numTilesW = std::ceil(imageWidth / static_cast(tileSize)); + const int numTilesH = std::ceil(imageHeight / static_cast(tileSize)); + std::tie(tileOffsets, tileGaussianIds) = FVDB_DISPATCH_KERNEL(mMeans.device(), [&]() { + return detail::ops::dispatchGaussianTileIntersection(perGaussian2dMean, + perGaussianRadius, + perGaussianDepth, + at::nullopt, + C, + tileSize, + numTilesH, + numTilesW); + }); + } + + const torch::Tensor distortionCoeffsForRaster = distortionCoeffs.has_value() + ? 
distortionCoeffs.value() + : torch::empty({C, 0}, mMeans.options()); + + auto outputs = detail::autograd::RasterizeGaussiansToPixelsFromWorld3DGS::apply( + mMeans, + mQuats, + mLogScales, + perGaussianRenderQuantity, + perGaussianOpacity, + worldToCameraMatrices, + worldToCameraMatrices, + projectionMatrices, + distortionCoeffsForRaster, + fvdb::detail::ops::RollingShutterType::NONE, + cameraModel, + static_cast(imageWidth), + static_cast(imageHeight), + 0, + 0, + static_cast(tileSize), + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + + return {outputs[0], outputs[1]}; +} + std::tuple GaussianSplat3d::renderDepths(const torch::Tensor &worldToCameraMatrices, const torch::Tensor &projectionMatrices, diff --git a/src/fvdb/GaussianSplat3d.h b/src/fvdb/GaussianSplat3d.h index 2c6248e1..3716b6ca 100644 --- a/src/fvdb/GaussianSplat3d.h +++ b/src/fvdb/GaussianSplat3d.h @@ -5,12 +5,15 @@ #define FVDB_GAUSSIANSPLAT3D_H #include +#include #include #include #include #include +#include + namespace fvdb { /// @brief A class representing a Gaussian splat scene in 3D space. @@ -905,6 +908,34 @@ class GaussianSplat3d { const bool antialias = false, const std::optional &backgrounds = std::nullopt); + /// @brief Render images by rasterizing directly from world-space 3D Gaussians. + /// + /// This is similar to @ref renderImages but performs rasterization directly from world-space + /// Gaussians (means/quats/log-scales) rather than from their 2D projections. This enables + /// geometry gradients through the rasterization step. + /// + /// Tile intersections are still computed using a (non-differentiable) projection step: + /// - For `cameraModel == CameraModel::PINHOLE` or `CameraModel::ORTHOGRAPHIC`, we reuse the + /// classic projection path. + /// - For OpenCV camera models, we use the Unscented Transform (UT) projection kernel to + /// compute per-Gaussian radii and depths for sorting / tiling, then rasterize with 3DGS. 
+ std::tuple renderImagesFromWorld( + const torch::Tensor &worldToCameraMatrices, + const torch::Tensor &projectionMatrices, + const size_t imageWidth, + const size_t imageHeight, + const float near, + const float far, + const fvdb::detail::ops::CameraModel cameraModel = fvdb::detail::ops::CameraModel::PINHOLE, + const std::optional &distortionCoeffs = std::nullopt, + const int64_t shDegreeToUse = -1, + const size_t tileSize = 16, + const float minRadius2d = 0.0, + const float eps2d = 0.3, + const bool antialias = false, + const std::optional &backgrounds = std::nullopt, + const std::optional &masks = std::nullopt); + /// @brief Render depths of this Gaussian splat scene from the given camera matrices and /// projection matrices. /// @param worldToCameraMatrices [C, 4, 4] Camera-to-world matrices diff --git a/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp new file mode 100644 index 00000000..219890dd --- /dev/null +++ b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp @@ -0,0 +1,204 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include + +#include + +namespace fvdb::detail::autograd { + +RasterizeGaussiansToPixelsFromWorld3DGS::VariableList +RasterizeGaussiansToPixelsFromWorld3DGS::forward( + RasterizeGaussiansToPixelsFromWorld3DGS::AutogradContext *ctx, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &means, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &quats, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &logScales, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &features, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &opacities, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &worldToCamMatricesStart, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &worldToCamMatricesEnd, + const 
RasterizeGaussiansToPixelsFromWorld3DGS::Variable &projectionMatrices, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &distortionCoeffs, + const fvdb::detail::ops::RollingShutterType rollingShutterType, + const fvdb::detail::ops::CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &tileOffsets, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &tileGaussianIds, + std::optional backgrounds, + std::optional masks) { + FVDB_FUNC_RANGE_WITH_NAME("RasterizeGaussiansToPixelsFromWorld3DGS::forward"); + + fvdb::detail::ops::RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.imageOriginW = imageOriginW; + settings.imageOriginH = imageOriginH; + settings.tileSize = tileSize; + + auto outputs = FVDB_DISPATCH_KERNEL_DEVICE(means.device(), [&]() { + return ops::dispatchGaussianRasterizeFromWorld3DGSForward( + means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + rollingShutterType, + cameraModel, + settings, + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + }); + + Variable renderedFeatures = std::get<0>(outputs); + Variable renderedAlphas = std::get<1>(outputs); + Variable lastIds = std::get<2>(outputs); + + std::vector toSave = {means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + tileOffsets, + tileGaussianIds, + renderedAlphas, + lastIds}; + if (backgrounds.has_value()) { + toSave.push_back(backgrounds.value()); + ctx->saved_data["has_backgrounds"] = true; + } else { + ctx->saved_data["has_backgrounds"] = false; + } + if (masks.has_value()) { + toSave.push_back(masks.value()); + ctx->saved_data["has_masks"] = true; + } 
else { + ctx->saved_data["has_masks"] = false; + } + ctx->save_for_backward(toSave); + + ctx->saved_data["imageWidth"] = (int64_t)imageWidth; + ctx->saved_data["imageHeight"] = (int64_t)imageHeight; + ctx->saved_data["imageOriginW"] = (int64_t)imageOriginW; + ctx->saved_data["imageOriginH"] = (int64_t)imageOriginH; + ctx->saved_data["tileSize"] = (int64_t)tileSize; + ctx->saved_data["cameraModel"] = (int64_t)cameraModel; + ctx->saved_data["rollingShutterType"] = (int64_t)rollingShutterType; + + return {renderedFeatures, renderedAlphas}; +} + +RasterizeGaussiansToPixelsFromWorld3DGS::VariableList +RasterizeGaussiansToPixelsFromWorld3DGS::backward( + RasterizeGaussiansToPixelsFromWorld3DGS::AutogradContext *ctx, + RasterizeGaussiansToPixelsFromWorld3DGS::VariableList gradOutput) { + FVDB_FUNC_RANGE_WITH_NAME("RasterizeGaussiansToPixelsFromWorld3DGS::backward"); + + Variable dLossDRenderedFeatures = gradOutput.at(0); + Variable dLossDRenderedAlphas = gradOutput.at(1); + if (dLossDRenderedFeatures.defined()) { + dLossDRenderedFeatures = dLossDRenderedFeatures.contiguous(); + } + if (dLossDRenderedAlphas.defined()) { + dLossDRenderedAlphas = dLossDRenderedAlphas.contiguous(); + } + + VariableList saved = ctx->get_saved_variables(); + Variable means = saved.at(0); + Variable quats = saved.at(1); + Variable logScales = saved.at(2); + Variable features = saved.at(3); + Variable opacities = saved.at(4); + Variable worldToCamMatricesStart = saved.at(5); + Variable worldToCamMatricesEnd = saved.at(6); + Variable projectionMatrices = saved.at(7); + Variable distortionCoeffs = saved.at(8); + Variable tileOffsets = saved.at(9); + Variable tileGaussianIds = saved.at(10); + Variable renderedAlphas = saved.at(11); + Variable lastIds = saved.at(12); + + const bool hasBackgrounds = ctx->saved_data["has_backgrounds"].toBool(); + const bool hasMasks = ctx->saved_data["has_masks"].toBool(); + std::optional backgrounds = std::nullopt; + std::optional masks = std::nullopt; + int64_t 
optIdx = 13; + if (hasBackgrounds) { + backgrounds = saved.at(optIdx++); + } + if (hasMasks) { + masks = saved.at(optIdx++); + } + + const uint32_t imageWidth = (uint32_t)ctx->saved_data["imageWidth"].toInt(); + const uint32_t imageHeight = (uint32_t)ctx->saved_data["imageHeight"].toInt(); + const uint32_t imageOriginW = (uint32_t)ctx->saved_data["imageOriginW"].toInt(); + const uint32_t imageOriginH = (uint32_t)ctx->saved_data["imageOriginH"].toInt(); + const uint32_t tileSize = (uint32_t)ctx->saved_data["tileSize"].toInt(); + const auto cameraModel = + static_cast(ctx->saved_data["cameraModel"].toInt()); + const auto rollingShutterType = static_cast( + ctx->saved_data["rollingShutterType"].toInt()); + + fvdb::detail::ops::RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.imageOriginW = imageOriginW; + settings.imageOriginH = imageOriginH; + settings.tileSize = tileSize; + + auto grads = FVDB_DISPATCH_KERNEL_DEVICE(means.device(), [&]() { + return ops::dispatchGaussianRasterizeFromWorld3DGSBackward( + means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + rollingShutterType, + cameraModel, + settings, + tileOffsets, + tileGaussianIds, + renderedAlphas, + lastIds, + dLossDRenderedFeatures, + dLossDRenderedAlphas, + backgrounds, + masks); + }); + + Variable dMeans = std::get<0>(grads); + Variable dQuats = std::get<1>(grads); + Variable dLogScales = std::get<2>(grads); + Variable dFeatures = std::get<3>(grads); + Variable dOpacities = std::get<4>(grads); + + // Return gradients in the same order as forward inputs. 
+ return {dMeans, dQuats, dLogScales, dFeatures, dOpacities, Variable(), Variable(), + Variable(), Variable(), Variable(), Variable(), Variable(), Variable(), Variable(), + Variable(), Variable(), Variable(), Variable(), Variable(), Variable()}; +} + +} // namespace fvdb::detail::autograd diff --git a/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h new file mode 100644 index 00000000..2174dcd0 --- /dev/null +++ b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h @@ -0,0 +1,50 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H +#define FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H + +#include + +#include + +#include + +namespace fvdb::detail::autograd { + +/// @brief Autograd wrapper for dense rasterization from world-space 3D Gaussians. +struct RasterizeGaussiansToPixelsFromWorld3DGS + : public torch::autograd::Function { + using VariableList = torch::autograd::variable_list; + using AutogradContext = torch::autograd::AutogradContext; + using Variable = torch::autograd::Variable; + + static VariableList + forward(AutogradContext *ctx, + const Variable &means, // [N,3] + const Variable &quats, // [N,4] + const Variable &logScales, // [N,3] + const Variable &features, // [C,N,D] + const Variable &opacities, // [C,N] + const Variable &worldToCamMatricesStart, // [C,4,4] + const Variable &worldToCamMatricesEnd, // [C,4,4] + const Variable &projectionMatrices, // [C,3,3] + const Variable &distortionCoeffs, // [C,K] + const fvdb::detail::ops::RollingShutterType rollingShutterType, + const fvdb::detail::ops::CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const Variable &tileOffsets, // [C, tileH, tileW] + const Variable &tileGaussianIds, // [n_isects] + std::optional 
backgrounds = std::nullopt, // [C,D] + std::optional masks = std::nullopt); // [C,tileH,tileW] bool + + static VariableList backward(AutogradContext *ctx, VariableList gradOutput); +}; + +} // namespace fvdb::detail::autograd + +#endif // FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh b/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh new file mode 100644 index 00000000..2d7773c5 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh @@ -0,0 +1,61 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH + +#include + +namespace fvdb::detail::ops { + +template +inline __host__ __device__ void +loadWorldToCamRtRowMajor4x4(const T *m44, nanovdb::math::Mat3 &R, nanovdb::math::Vec3 &t) { + // Row-major 4x4 with last column = translation. 
+ R = nanovdb::math::Mat3(m44[0], + m44[1], + m44[2], // 1st row + m44[4], + m44[5], + m44[6], // 2nd row + m44[8], + m44[9], + m44[10]); // 3rd row + t = nanovdb::math::Vec3(m44[3], m44[7], m44[11]); +} + +template +inline __device__ void +loadWorldToCamRtFromAccessor44(const Acc44 &m44 /* [C,4,4] */, + const int64_t camId, + nanovdb::math::Mat3 &R, + nanovdb::math::Vec3 &t) { + R = nanovdb::math::Mat3(m44[camId][0][0], + m44[camId][0][1], + m44[camId][0][2], // 1st row + m44[camId][1][0], + m44[camId][1][1], + m44[camId][1][2], // 2nd row + m44[camId][2][0], + m44[camId][2][1], + m44[camId][2][2]); // 3rd row + t = nanovdb::math::Vec3(m44[camId][0][3], m44[camId][1][3], m44[camId][2][3]); +} + +template +inline __device__ nanovdb::math::Mat3 +loadMat3FromAccessor33(const Acc33 &m33 /* [C,3,3] */, const int64_t camId) { + return nanovdb::math::Mat3(m33[camId][0][0], + m33[camId][0][1], + m33[camId][0][2], + m33[camId][1][0], + m33[camId][1][1], + m33[camId][1][2], + m33[camId][2][0], + m33[camId][2][1], + m33[camId][2][2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h b/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h new file mode 100644 index 00000000..aa45ba5b --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h @@ -0,0 +1,40 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H + +#include + +namespace fvdb { +namespace detail { +namespace ops { + +/// @brief Rolling shutter policy for camera projection / ray generation. +enum class RollingShutterType : int32_t { NONE = 0, VERTICAL = 1, HORIZONTAL = 2 }; + +/// @brief Camera model for projection (shared across projection and 3DGS rasterization paths). 
+/// +/// Notes: +/// - `PINHOLE` and `ORTHOGRAPHIC` have no distortion. +/// - `OPENCV_*` variants are pinhole + OpenCV-style distortion, and expect packed coefficients. +enum class CameraModel : int32_t { + // Pinhole intrinsics only (no distortion). + PINHOLE = 0, + + // OpenCV variants which are just pinhole intrinsics + optional distortion (all of them use the + // same [C,12] distortion coefficients layout: [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]). + OPENCV_RADTAN_5 = 1, // polynomial radial (k1,k2,k3) + tangential (p1,p2) + OPENCV_RATIONAL_8 = 2, // rational radial (k1..k6) + tangential (p1,p2) + OPENCV_RADTAN_THIN_PRISM_9 = 3, // polynomial radial + tangential + thin-prism (s1..s4) + OPENCV_THIN_PRISM_12 = 4, // rational radial + tangential + thin-prism (s1..s4) + + // Orthographic intrinsics (no distortion). + ORTHOGRAPHIC = 5, +}; + +} // namespace ops +} // namespace detail +} // namespace fvdb + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh b/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh new file mode 100644 index 00000000..0d112ad6 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh @@ -0,0 +1,72 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __device__ void +copyMat3AccessorToShared(const int64_t C, + nanovdb::math::Mat3 *__restrict__ out, + const Acc33 &acc /* [C,3,3] */) { + constexpr int64_t kElementsPerMat3 = 9; + for (int64_t i = threadIdx.x; i < C * kElementsPerMat3; i += blockDim.x) { + const int64_t camId = i / kElementsPerMat3; + const int64_t entryId = i % kElementsPerMat3; + const int64_t rowId = entryId / 3; + const int64_t colId = entryId % 3; + out[camId][rowId][colId] = 
acc[camId][rowId][colId]; + } +} + +template +inline __device__ void +copyWorldToCamRotationToShared(const int64_t C, + nanovdb::math::Mat3 *__restrict__ out, + const Acc44 &acc /* [C,4,4] */) { + constexpr int64_t kElementsPerMat3 = 9; + for (int64_t i = threadIdx.x; i < C * kElementsPerMat3; i += blockDim.x) { + const int64_t camId = i / kElementsPerMat3; + const int64_t entryId = i % kElementsPerMat3; + const int64_t rowId = entryId / 3; + const int64_t colId = entryId % 3; + out[camId][rowId][colId] = acc[camId][rowId][colId]; + } +} + +template +inline __device__ void +copyWorldToCamTranslationToShared(const int64_t C, + nanovdb::math::Vec3 *__restrict__ out, + const Acc44 &acc /* [C,4,4] */) { + constexpr int64_t kElementsPerVec3 = 3; + for (int64_t i = threadIdx.x; i < C * kElementsPerVec3; i += blockDim.x) { + const int64_t camId = i / kElementsPerVec3; + const int64_t entryId = i % kElementsPerVec3; + out[camId][entryId] = acc[camId][entryId][3]; + } +} + +template +inline __device__ void +copyDistortionCoeffsToShared(const int64_t C, + const int64_t K, + ScalarType *__restrict__ out /* [C*K] */, + const AccCk &acc /* [C,K] */) { + const int64_t total = C * K; + for (int64_t i = threadIdx.x; i < total; i += blockDim.x) { + const int64_t camId = i / K; + const int64_t entryId = i % K; + out[camId * K + entryId] = acc[camId][entryId]; + } +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh b/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh new file mode 100644 index 00000000..0a3238a4 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh @@ -0,0 +1,29 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH + +#include + +namespace fvdb::detail::ops { + +template struct 
CameraIntrinsics { + T fx, fy, cx, cy; +}; + +template +inline __host__ __device__ CameraIntrinsics +loadIntrinsics(const nanovdb::math::Mat3 &K) { + return CameraIntrinsics{K[0][0], K[1][1], K[0][2], K[1][2]}; +} + +template +inline __host__ __device__ CameraIntrinsics +loadIntrinsicsRowMajor3x3(const T *K9) { + return CameraIntrinsics{K9[0], K9[4], K9[2], K9[5]}; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh b/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh new file mode 100644 index 00000000..61e43562 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh @@ -0,0 +1,125 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH + +#include + +#include + +#include + +namespace fvdb::detail::ops { + +/// @brief Apply OpenCV-style distortion to normalized camera-plane coordinates. +/// +/// This uses FVDB's packed coefficient layout: +/// `[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]`. +/// +/// Notes: +/// - `CameraModel::PINHOLE` and `CameraModel::ORTHOGRAPHIC` ignore distortion. +/// - Unused coefficients for a given model are ignored (but still read from the packed array). +/// +/// @param cameraModel Camera model selector. +/// @param p_normalized Normalized camera-plane coordinate. +/// - perspective: (x/z, y/z) +/// - orthographic: (x, y) +/// @param distortionCoeffs Pointer to packed coefficients (may be null for non-OpenCV models). +/// @param numCoeffs Number of coefficients per camera (0 for PINHOLE/ORTHOGRAPHIC, 12 for OPENCV). 
+template +inline __host__ __device__ nanovdb::math::Vec2 +applyOpenCVDistortionPacked(const CameraModel cameraModel, + const nanovdb::math::Vec2 &p_normalized, + const T *distortionCoeffs, + const int64_t numCoeffs) { + // For pinhole/orthographic, distortion is ignored. + if (cameraModel == CameraModel::PINHOLE || cameraModel == CameraModel::ORTHOGRAPHIC || + numCoeffs == 0 || distortionCoeffs == nullptr) { + return p_normalized; + } + + // Packed OpenCV coefficient layout: + // [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4] + // NOTE: for RADTAN_5 we use k1,k2,k3 and ignore k4..k6; for rational we use k1..k6. + const T x = p_normalized[0]; + const T y = p_normalized[1]; + const T x2 = x * x; + const T y2 = y * y; + const T xy = x * y; + const T r2 = x2 + y2; + const T r4 = r2 * r2; + const T r6 = r4 * r2; + + const T k1 = distortionCoeffs[0]; + const T k2 = distortionCoeffs[1]; + const T k3 = distortionCoeffs[2]; + const T k4 = distortionCoeffs[3]; + const T k5 = distortionCoeffs[4]; + const T k6 = distortionCoeffs[5]; + const T p1 = distortionCoeffs[6]; + const T p2 = distortionCoeffs[7]; + const T s1 = distortionCoeffs[8]; + const T s2 = distortionCoeffs[9]; + const T s3 = distortionCoeffs[10]; + const T s4 = distortionCoeffs[11]; + + T radial = T(1); + if (cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { + const T num = T(1) + r2 * (k1 + r2 * (k2 + r2 * k3)); + const T den = T(1) + r2 * (k4 + r2 * (k5 + r2 * k6)); + radial = (den != T(0)) ? 
(num / den) : T(0); + } else if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { + radial = T(1) + k1 * r2 + k2 * r4 + k3 * r6; + } + + T x_dist = x * radial; + T y_dist = y * radial; + + // Tangential + x_dist += T(2) * p1 * xy + p2 * (r2 + T(2) * x2); + y_dist += p1 * (r2 + T(2) * y2) + T(2) * p2 * xy; + + // Thin prism + if (cameraModel == CameraModel::OPENCV_THIN_PRISM_12 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { + x_dist += s1 * r2 + s2 * r4; + y_dist += s3 * r2 + s4 * r4; + } + + return nanovdb::math::Vec2(x_dist, y_dist); +} + +/// @brief Iteratively undistort a packed OpenCV distortion model using fixed-point iteration. +/// +/// This solves for `x` such that `applyOpenCVDistortionPacked(x) == p_distorted`. +/// It uses a small fixed iteration count (matching the UT kernel style) and is intended for use in +/// ray generation. +template +inline __host__ __device__ nanovdb::math::Vec2 +undistortOpenCVPackedFixedPoint(const CameraModel cameraModel, + const nanovdb::math::Vec2 &p_distorted, + const T *distortionCoeffs, + const int64_t numCoeffs, + const int iters = 8) { + if (cameraModel == CameraModel::PINHOLE || cameraModel == CameraModel::ORTHOGRAPHIC || + numCoeffs == 0 || distortionCoeffs == nullptr) { + return p_distorted; + } + + nanovdb::math::Vec2 x = p_distorted; + for (int it = 0; it < iters; ++it) { + const nanovdb::math::Vec2 x_dist = + applyOpenCVDistortionPacked(cameraModel, x, distortionCoeffs, numCoeffs); + const nanovdb::math::Vec2 err = x_dist - p_distorted; + x[0] -= err[0]; + x[1] -= err[1]; + } + return x; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu index 9f349597..d8f76ed4 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu +++ 
b/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include #include @@ -126,9 +127,8 @@ projectionBackwardKernel(const int32_t offset, dLossDConics += idx * 3; // vjp: compute the inverse of the 2d covariance - const nanovdb::math::Mat2 covar2dInverse(conics[0], conics[1], conics[1], conics[2]); - const nanovdb::math::Mat2 dLossDCovar2dInverse( - dLossDConics[0], dLossDConics[1] * .5f, dLossDConics[1] * .5f, dLossDConics[2]); + const nanovdb::math::Mat2 covar2dInverse = loadConicRowMajor3(conics); + const nanovdb::math::Mat2 dLossDCovar2dInverse = loadConicGradRowMajor3(dLossDConics); nanovdb::math::Mat2 dLossDCovar2d = inverseVectorJacobianProduct(covar2dInverse, dLossDCovar2dInverse); @@ -141,41 +141,20 @@ projectionBackwardKernel(const int32_t offset, } // transform Gaussian to camera space - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); nanovdb::math::Mat3 covar; nanovdb::math::Vec4 quat; nanovdb::math::Vec3 scale; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and logScales quats += gId * 4; logScales += gId * 3; - quat = nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]); - scale = nanovdb::math::Vec3(::cuda::std::exp(logScales[0]), - ::cuda::std::exp(logScales[1]), - 
::cuda::std::exp(logScales[2])); - + loadQuatScaleFromLogScalesRowMajor(quats, logScales, quat, scale); covar = quaternionAndScaleToCovariance(quat, scale); } @@ -185,17 +164,16 @@ projectionBackwardKernel(const int32_t offset, const nanovdb::math::Mat3 &covarCamSpace = transformCovarianceWorldToCam(R, covar); // vjp: camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); auto [dLossDCovarCamSpace, dLossDMeanCamSpace] = [&]() { if constexpr (Ortho) { return projectGaussianOrthographicVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -204,10 +182,10 @@ projectionBackwardKernel(const int32_t offset, return projectGaussianPerspectiveVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -263,7 +241,6 @@ projectionBackwardKernel(const int32_t offset, quaternionAndScaleToCovarianceVectorJacobianProduct( quat, scale, rotmat, dLossDCovar); - warpSum(dLossDQuat, warp_group_g); warpSum(dLossDLogScale, warp_group_g); if (warp_group_g.thread_rank() == 0) { outDLossDQuats += gId * 4; diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu index 950cec6e..2f49ceaf 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu @@ -1,7 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // +#include #include +#include #include #include #include @@ -136,15 +138,26 @@ template struct ProjectionForward { const Mat3 covarCamSpace = transformCovarianceWorldToCam(worldToCamRotMatrix, covar); // camera projection - const T fx 
= projectionMatrix[0][0], cx = projectionMatrix[0][2], - fy = projectionMatrix[1][1], cy = projectionMatrix[1][2]; - auto [covar2d, mean2d] = [&]() { + const CameraIntrinsics intr = loadIntrinsics(projectionMatrix); + auto [covar2d, mean2d] = [&]() { if constexpr (Ortho) { - return projectGaussianOrthographic( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, mImageWidth, mImageHeight); + return projectGaussianOrthographic(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + mImageWidth, + mImageHeight); } else { - return projectGaussianPerspective( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, mImageWidth, mImageHeight); + return projectGaussianPerspective(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + mImageWidth, + mImageHeight); } }(); @@ -157,10 +170,8 @@ template struct ProjectionForward { const Mat2 covar2dInverse = covar2d.inverse(); - // take 3 sigma as the radius - const T b = 0.5f * (covar2d[0][0] + covar2d[1][1]); - const T v1 = b + sqrt(max(0.01f, b * b - det)); - const T radius = ceil(3.f * sqrt(v1)); + // take 3 sigma as the radius (non-differentiable) + const T radius = radiusFromCovariance2dDet(covar2d, det, T(3)); if (radius <= mRadiusClip) { mOutRadiiAcc[cid][gid] = 0; @@ -168,20 +179,20 @@ template struct ProjectionForward { } // Mask out gaussians outside the image region - if (mean2d[0] + radius <= 0 || mean2d[0] - radius >= mImageWidth || - mean2d[1] + radius <= 0 || mean2d[1] - radius >= mImageHeight) { + if (isOutsideImageWithRadius(mean2d, radius, radius, mImageWidth, mImageHeight)) { mOutRadiiAcc[cid][gid] = 0; return; } // Write outputs - mOutRadiiAcc[cid][gid] = int32_t(radius); - mOutMeans2dAcc[cid][gid][0] = mean2d[0]; - mOutMeans2dAcc[cid][gid][1] = mean2d[1]; - mOutDepthsAcc[cid][gid] = meansCamSpace[2]; - mOutConicsAcc[cid][gid][0] = covar2dInverse[0][0]; - mOutConicsAcc[cid][gid][1] = covar2dInverse[0][1]; - mOutConicsAcc[cid][gid][2] = covar2dInverse[1][1]; + 
mOutRadiiAcc[cid][gid] = int32_t(radius); + mOutMeans2dAcc[cid][gid][0] = mean2d[0]; + mOutMeans2dAcc[cid][gid][1] = mean2d[1]; + mOutDepthsAcc[cid][gid] = meansCamSpace[2]; + const nanovdb::math::Vec3 conicPacked = packConicRowMajor3(covar2dInverse); + mOutConicsAcc[cid][gid][0] = conicPacked[0]; + mOutConicsAcc[cid][gid][1] = conicPacked[1]; + mOutConicsAcc[cid][gid][2] = conicPacked[2]; if (mOutCompensationsAcc != nullptr) { mOutCompensationsAcc[idx] = compensation; } @@ -189,37 +200,19 @@ template struct ProjectionForward { inline __device__ void loadCamerasIntoSharedMemory() { - // Load projection matrices and world-to-camera matrices into shared memory + // Load projection matrices and world-to-camera matrices into shared memory. alignas(Mat3) extern __shared__ char sharedMemory[]; - projectionMatsShared = reinterpret_cast(sharedMemory); - worldToCamRotMatsShared = reinterpret_cast(sharedMemory + C * sizeof(Mat3)); - worldToCamTranslation = reinterpret_cast(sharedMemory + C * 2 * sizeof(Mat3)); - - const int64_t totalElements = C * (9 + 9 + 3); - for (int64_t i = threadIdx.x; i < totalElements; i += blockDim.x) { - if (i < C * 9) { - const auto camId = i / 9; - const auto entryId = i % 9; - const auto rowId = entryId / 3; - const auto colId = entryId % 3; - projectionMatsShared[camId][rowId][colId] = - mProjectionMatricesAcc[camId][rowId][colId]; - } else if (i < 2 * C * 9) { - const auto baseIdx = i - C * 9; - const auto camId = baseIdx / 9; - const auto entryId = baseIdx % 9; - const auto rowId = entryId / 3; - const auto colId = entryId % 3; - worldToCamRotMatsShared[camId][rowId][colId] = - mWorldToCamMatricesAcc[camId][rowId][colId]; - } else { - const auto baseIdx = i - 2 * C * 9; - const auto camId = baseIdx / 3; - const auto entryId = baseIdx % 3; - worldToCamTranslation[camId][entryId] = mWorldToCamMatricesAcc[camId][entryId][3]; - } - } + uint8_t *ptr = reinterpret_cast(sharedMemory); + projectionMatsShared = reinterpret_cast(ptr); + ptr += C * 
sizeof(Mat3); + worldToCamRotMatsShared = reinterpret_cast(ptr); + ptr += C * sizeof(Mat3); + worldToCamTranslation = reinterpret_cast(ptr); + + copyMat3AccessorToShared(C, projectionMatsShared, mProjectionMatricesAcc); + copyWorldToCamRotationToShared(C, worldToCamRotMatsShared, mWorldToCamMatricesAcc); + copyWorldToCamTranslationToShared(C, worldToCamTranslation, mWorldToCamMatricesAcc); } }; diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu index 1aa350f8..2c545383 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include #include @@ -80,9 +81,8 @@ jaggedProjectionBackwardKernel( dLossDConics += idx * 3; // vjp: compute the inverse of the 2d covariance - const nanovdb::math::Mat2 covar2dInverse(conics[0], conics[1], conics[1], conics[2]); - const nanovdb::math::Mat2 dLossDCovar2dInverse( - dLossDConics[0], dLossDConics[1] * .5f, dLossDConics[1] * .5f, dLossDConics[2]); + const nanovdb::math::Mat2 covar2dInverse = loadConicRowMajor3(conics); + const nanovdb::math::Mat2 dLossDCovar2dInverse = loadConicGradRowMajor3(dLossDConics); nanovdb::math::Mat2 dLossDCovar2d = inverseVectorJacobianProduct(covar2dInverse, dLossDCovar2dInverse); @@ -95,39 +95,20 @@ jaggedProjectionBackwardKernel( } // transform Gaussian to camera space - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); 
nanovdb::math::Mat3 covar; nanovdb::math::Vec4 quat; nanovdb::math::Vec3 scale; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and scales quats += gId * 4; scales += gId * 3; - quat = nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]); - scale = nanovdb::math::Vec3(scales[0], scales[1], scales[2]); - + loadQuatScaleFromScalesRowMajor(quats, scales, quat, scale); covar = quaternionAndScaleToCovariance(quat, scale); } const nanovdb::math::Vec3 &meansCamSpace = @@ -136,17 +117,16 @@ jaggedProjectionBackwardKernel( const nanovdb::math::Mat3 &covarCamSpace = transformCovarianceWorldToCam(R, covar); // vjp: camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); auto [dLossDCovarCamSpace, dLossDMeanCamSpace] = [&]() { if constexpr (Ortho) { return projectGaussianOrthographicVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -155,10 +135,10 @@ jaggedProjectionBackwardKernel( return projectGaussianPerspectiveVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu index b40c4d18..1211f873 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include 
#include @@ -63,17 +64,9 @@ jaggedProjectionForwardKernel(const uint32_t B, projectionMatrices += cId * 9; // input is row-major - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); // transform Gaussian center to camera space const nanovdb::math::Vec3 meansCamSpace = transformPointWorldToCam(R, t, nanovdb::math::Vec3(means[0], means[1], means[2])); @@ -87,36 +80,39 @@ jaggedProjectionForwardKernel(const uint32_t B, nanovdb::math::Mat3 covar; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and scales quats += gId * 4; scales += gId * 3; - covar = quaternionAndScaleToCovariance( - nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]), - nanovdb::math::Vec3(scales[0], scales[1], scales[2])); + nanovdb::math::Vec4 quat; + nanovdb::math::Vec3 scale; + loadQuatScaleFromScalesRowMajor(quats, scales, quat, scale); + covar = quaternionAndScaleToCovariance(quat, scale); } const nanovdb::math::Mat3 covarCamSpace = transformCovarianceWorldToCam(R, covar); // camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; - auto [covar2d, mean2d] = [&]() { + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); + auto [covar2d, mean2d] = [&]() { if constexpr (Ortho) { - return 
projectGaussianOrthographic( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, imageWidth, imageHeight); + return projectGaussianOrthographic(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + imageWidth, + imageHeight); } else { - return projectGaussianPerspective( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, imageWidth, imageHeight); + return projectGaussianPerspective(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + imageWidth, + imageHeight); } }(); @@ -130,10 +126,8 @@ jaggedProjectionForwardKernel(const uint32_t B, // compute the inverse of the 2d covariance const nanovdb::math::Mat2 covar2dInverse = covar2d.inverse(); - // take 3 sigma as the radius (non differentiable) - const T b = 0.5f * (covar2d[0][0] + covar2d[1][1]); - const T v1 = b + sqrt(max(0.01f, b * b - det)); - const T radius = ceil(3.f * sqrt(v1)); + // take 3 sigma as the radius (non-differentiable) + const T radius = radiusFromCovariance2dDet(covar2d, det, T(3)); // T v2 = b - sqrt(max(0.1f, b * b - det)); // T radius = ceil(3.f * sqrt(max(v1, v2))); @@ -143,8 +137,7 @@ jaggedProjectionForwardKernel(const uint32_t B, } // mask out gaussians outside the image region - if (mean2d[0] + radius <= 0 || mean2d[0] - radius >= imageWidth || mean2d[1] + radius <= 0 || - mean2d[1] - radius >= imageHeight) { + if (isOutsideImageWithRadius(mean2d, radius, radius, imageWidth, imageHeight)) { radii[idx] = 0; return; } @@ -154,9 +147,7 @@ jaggedProjectionForwardKernel(const uint32_t B, means2d[idx * 2] = mean2d[0]; means2d[idx * 2 + 1] = mean2d[1]; depths[idx] = meansCamSpace[2]; - conics[idx * 3] = covar2dInverse[0][0]; - conics[idx * 3 + 1] = covar2dInverse[0][1]; - conics[idx * 3 + 2] = covar2dInverse[1][1]; + storeConicRowMajor3(covar2dInverse, conics + idx * 3); if (compensations != nullptr) { compensations[idx] = compensation; } diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu 
index 4c1125a2..a8f899de 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu @@ -2,7 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include #include +#include +#include #include #include #include @@ -42,19 +46,6 @@ template class OpenCVCameraModel { using Vec3 = nanovdb::math::Vec3; using Mat3 = nanovdb::math::Mat3; - /// @brief Internal distortion evaluation mode. - /// - /// This is intentionally separate from the public API `CameraModel` to keep the math paths - /// explicit and avoid implying that `CameraModel` is “just distortion” (since it also includes - /// projection as well). - enum class Model : uint8_t { - NONE = 0, // no distortion - RADTAN_5 = 5, // k1,k2,p1,p2,k3 (polynomial radial up to r^6) - RATIONAL_8 = 8, // k1,k2,p1,p2,k3,k4,k5,k6 (rational radial) - RADTAN_THIN_9 = 9, // RADTAN_5 + thin prism s1..s4 (polynomial radial + thin-prism) - THIN_PRISM_12 = 12, // RATIONAL_8 + s1,s2,s3,s4 - }; - /// @brief Construct a camera model for projection. /// /// For OpenCV models, coefficients are read from a per-camera packed layout: @@ -70,70 +61,22 @@ template class OpenCVCameraModel { /// cameraModel). __device__ __forceinline__ OpenCVCameraModel(const CameraModel cameraModel, const Mat3 &K_in, const T *distortionCoeffs) - : K(K_in) { + : K(K_in), mCameraModel(cameraModel), mDistortionCoeffs(distortionCoeffs) { // ORTHOGRAPHIC is implemented as pinhole intrinsics without the perspective divide. // Distortion is intentionally not supported for orthographic projection. 
orthographic = (cameraModel == CameraModel::ORTHOGRAPHIC); - if (cameraModel == CameraModel::ORTHOGRAPHIC) { - radial = tangential = thinPrism = nullptr; - numRadial = numTangential = numThinPrism = 0; - model = Model::NONE; + if (cameraModel == CameraModel::ORTHOGRAPHIC || cameraModel == CameraModel::PINHOLE) { + mNumDistortionCoeffs = 0; return; } - if (cameraModel == CameraModel::PINHOLE) { - radial = tangential = thinPrism = nullptr; - numRadial = numTangential = numThinPrism = 0; - model = Model::NONE; - return; - } - - // OpenCV models require the packed coefficient layout. - deviceAssertOrTrap(distortionCoeffs != nullptr); - const T *radial_in = distortionCoeffs + kRadialOffset; // k1..k6 - const T *tang_in = distortionCoeffs + kTangentialOffset; // p1,p2 - const T *thin_in = distortionCoeffs + kThinPrismOffset; // s1..s4 - - if (cameraModel == CameraModel::OPENCV_RADTAN_5) { - radial = radial_in; - numRadial = 3; // k1,k2,k3 (polynomial) - tangential = tang_in; - numTangential = 2; - thinPrism = nullptr; - numThinPrism = 0; - model = Model::RADTAN_5; - return; - } - if (cameraModel == CameraModel::OPENCV_RATIONAL_8) { - radial = radial_in; - numRadial = 6; // k1..k6 (rational) - tangential = tang_in; - numTangential = 2; - thinPrism = nullptr; - numThinPrism = 0; - model = Model::RATIONAL_8; - return; - } - if (cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { - radial = radial_in; - numRadial = 6; // k1..k6 (rational) - tangential = tang_in; - numTangential = 2; - thinPrism = thin_in; - numThinPrism = 4; // s1..s4 - model = Model::THIN_PRISM_12; - return; - } - if (cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { - // Polynomial radial + thin-prism; ignore k4..k6 by construction. 
- radial = radial_in; - numRadial = 3; // k1,k2,k3 (polynomial) - tangential = tang_in; - numTangential = 2; - thinPrism = thin_in; - numThinPrism = 4; // s1..s4 - model = Model::RADTAN_THIN_9; + if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9 || + cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { + deviceAssertOrTrap(mDistortionCoeffs != nullptr); + mNumDistortionCoeffs = 12; return; } @@ -162,7 +105,8 @@ template class OpenCVCameraModel { p_normalized = Vec2(p_cam[0] * z_inv, p_cam[1] * z_inv); } - const Vec2 p_distorted = applyDistortion(p_normalized); + const Vec2 p_distorted = applyOpenCVDistortionPacked( + mCameraModel, p_normalized, mDistortionCoeffs, mNumDistortionCoeffs); // Project to pixel coordinates. const T fx = K[0][0]; @@ -194,154 +138,10 @@ template class OpenCVCameraModel { // Camera intrinsics (typically backed by shared memory). const Mat3 &K; - bool orthographic = false; - - // Packed OpenCV coefficient layout offsets: - // [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4] - static constexpr int kRadialOffset = 0; // k1..k6 - static constexpr int kTangentialOffset = 6; // p1,p2 - static constexpr int kThinPrismOffset = 8; // s1..s4 - - // Coefficients for the distortion model. - const T *radial = nullptr; // k1..k6 (but k4..k6 only used in rational model) - int numRadial = 0; // Number of radial coefficients - const T *tangential = nullptr; // p1,p2 - int numTangential = 0; // Number of tangential coefficients - const T *thinPrism = nullptr; // s1..s4 - int numThinPrism = 0; // Number of thin prism coefficients - Model model = Model::NONE; // Distortion model - - /// @brief Read a coefficient if present, otherwise return 0. - /// - /// @param[in] ptr Coefficient array (may be null). - /// @param[in] n Number of coefficients in ptr. - /// @param[in] i Index to read. - /// @return Coefficient value or 0 if out-of-range / null. 
- __host__ __device__ inline T - coeffOrZero(const T *ptr, const int n, const int i) const { - return (ptr != nullptr && i >= 0 && i < n) ? ptr[i] : T(0); - } - - /// @brief Apply OpenCV distortion to a normalized camera-plane point. - /// - /// Input coordinates are assumed to already be normalized to the camera plane: - /// - perspective: \((x/z, y/z)\) - /// - orthographic: \((x, y)\) - /// - /// @param[in] p_normalized Normalized camera-plane coordinates. - /// @return Distorted normalized coordinates. - __device__ Vec2 - applyDistortion(const Vec2 &p_normalized) const { - const T x = p_normalized[0]; - const T y = p_normalized[1]; - const T x2 = x * x; - const T y2 = y * y; - const T xy = x * y; - - const T r2 = x2 + y2; - const T r4 = r2 * r2; - const T r6 = r4 * r2; - - // Radial distortion. - T radial_dist = T(1); - if (model == Model::RATIONAL_8 || model == Model::THIN_PRISM_12) { - const T k1 = coeffOrZero(radial, numRadial, 0); - const T k2 = coeffOrZero(radial, numRadial, 1); - const T k3 = coeffOrZero(radial, numRadial, 2); - const T k4 = coeffOrZero(radial, numRadial, 3); - const T k5 = coeffOrZero(radial, numRadial, 4); - const T k6 = coeffOrZero(radial, numRadial, 5); - const T num = T(1) + r2 * (k1 + r2 * (k2 + r2 * k3)); - const T den = T(1) + r2 * (k4 + r2 * (k5 + r2 * k6)); - radial_dist = (den != T(0)) ? (num / den) : T(0); - } else if (model == Model::RADTAN_5 || model == Model::RADTAN_THIN_9) { - // Polynomial radial (up to k3 / r^6). Thin-prism terms are applied below if enabled. - const T k1 = coeffOrZero(radial, numRadial, 0); - const T k2 = coeffOrZero(radial, numRadial, 1); - const T k3 = coeffOrZero(radial, numRadial, 2); - radial_dist = T(1) + k1 * r2 + k2 * r4 + k3 * r6; - } - - T x_dist = x * radial_dist; - T y_dist = y * radial_dist; - - // Tangential distortion. 
- const T p1 = coeffOrZero(tangential, numTangential, 0); - const T p2 = coeffOrZero(tangential, numTangential, 1); - x_dist += T(2) * p1 * xy + p2 * (r2 + T(2) * x2); - y_dist += p1 * (r2 + T(2) * y2) + T(2) * p2 * xy; - - // Thin-prism distortion. - if (model == Model::THIN_PRISM_12 || model == Model::RADTAN_THIN_9) { - const T s1 = coeffOrZero(thinPrism, numThinPrism, 0); - const T s2 = coeffOrZero(thinPrism, numThinPrism, 1); - const T s3 = coeffOrZero(thinPrism, numThinPrism, 2); - const T s4 = coeffOrZero(thinPrism, numThinPrism, 3); - x_dist += s1 * r2 + s2 * r4; - y_dist += s3 * r2 + s4 * r4; - } - - return Vec2(x_dist, y_dist); - } -}; - -/// @brief UT-local rigid transform (cached rotation + translation). -/// -/// Quaternion is stored as \([w,x,y,z]\) and is assumed to represent a rotation. -/// The corresponding rotation matrix \(R(q)\) is cached to avoid recomputing it for every point -/// transform (UT sigma points, rolling-shutter iterations, depth culls, etc.). -template struct RigidTransform { - nanovdb::math::Mat3 R; - nanovdb::math::Vec4 q; - nanovdb::math::Vec3 t; - - /// @brief Default constructor (identity transform). - /// - /// Initializes to unit quaternion \([1,0,0,0]\) and zero translation. - __device__ - RigidTransform() - : R(nanovdb::math::Mat3(nanovdb::math::Vec3(T(1), T(0), T(0)), - nanovdb::math::Vec3(T(0), T(1), T(0)), - nanovdb::math::Vec3(T(0), T(0), T(1)))), - q(T(1), T(0), T(0), T(0)), t(T(0), T(0), T(0)) {} - - /// @brief Construct from quaternion and translation. - /// @param[in] q_in Rotation quaternion \([w,x,y,z]\). - /// @param[in] t_in Translation vector. - __device__ - RigidTransform(const nanovdb::math::Vec4 &q_in, const nanovdb::math::Vec3 &t_in) - : R(quaternionToRotationMatrix(q_in)), q(q_in), t(t_in) {} - - /// @brief Construct from rotation matrix and translation. - /// @param[in] R_in Rotation matrix. - /// @param[in] t_in Translation vector. 
- __device__ - RigidTransform(const nanovdb::math::Mat3 &R_in, const nanovdb::math::Vec3 &t_in) - : R(R_in), q(rotationMatrixToQuaternion(R_in)), t(t_in) {} - - /// @brief Apply the transform to a 3D point: \(R(q)\,p + t\). - /// @param[in] p_world Point to transform. - /// @return Transformed point. - __device__ __forceinline__ nanovdb::math::Vec3 - apply(const nanovdb::math::Vec3 &p_world) const { - // p_cam = R * p_world + t - return R * p_world + t; - } - - /// @brief Interpolate between two rigid transforms. - /// - /// Translation is linearly interpolated; rotation uses NLERP along the shortest arc. - /// - /// @param[in] u Interpolation parameter in \([0,1]\). - /// @param[in] start Start transform. - /// @param[in] end End transform. - /// @return Interpolated transform. - static inline __device__ RigidTransform - interpolate(const T u, const RigidTransform &start, const RigidTransform &end) { - const nanovdb::math::Vec3 t_interp = start.t + u * (end.t - start.t); - const nanovdb::math::Vec4 q_interp = nlerpQuaternionShortestPath(start.q, end.q, u); - return RigidTransform(q_interp, t_interp); - } + bool orthographic = false; + CameraModel mCameraModel = CameraModel::PINHOLE; + const T *mDistortionCoeffs = nullptr; // packed [12] for OPENCV_* models + int64_t mNumDistortionCoeffs = 0; // 0 for PINHOLE/ORTHOGRAPHIC, 12 for OPENCV_* }; /// @brief Projection status for a single world point. @@ -465,13 +265,8 @@ template struct WorldToPixelTransform { // Fixed small iteration count (good enough for convergence in practice). 
constexpr int kIters = 6; for (int it = 0; it < kIters; ++it) { - ScalarType t_rs = ScalarType(0); - if (rollingShutterType == RollingShutterType::VERTICAL) { - t_rs = floor(pix_prev[1]) / max(ScalarType(1), ScalarType(imageHeight - 1)); - } else if (rollingShutterType == RollingShutterType::HORIZONTAL) { - t_rs = floor(pix_prev[0]) / max(ScalarType(1), ScalarType(imageWidth - 1)); - } - t_rs = min(ScalarType(1), max(ScalarType(0), t_rs)); + const ScalarType t_rs = rollingShutterTimeFromPixel( + rollingShutterType, pix_prev[0], pix_prev[1], imageWidth, imageHeight); const RigidTransform pose_rs = RigidTransform::interpolate(t_rs, worldToCamStart, worldToCamEnd); Vec2 pix_rs(ScalarType(0), ScalarType(0)); @@ -774,7 +569,7 @@ template struct ProjectionForwardUT { /// Layout is `[K, R_start, R_end, t_start, t_end, distortionCoeffs]` per camera. inline __device__ void loadCameraInfoIntoSharedMemory() { - // Load projection matrices and world-to-camera matrices into shared memory + // Load per-camera matrices/coeffs into shared memory. alignas(Mat3) extern __shared__ char sharedMemory[]; // Alignment sanity checks for the shared-memory layout below. If any of these fail, the @@ -783,9 +578,6 @@ template struct ProjectionForwardUT { static_assert(alignof(Mat3) >= alignof(ScalarType), "Mat3 alignment must cover ScalarType alignment"); - constexpr int64_t kMat3Elements = 9; // 3x3 - constexpr int64_t kVec3Elements = 3; // 3 - // Keep a running pointer which we increment to assign shared memory blocks uint8_t *pointer = reinterpret_cast(sharedMemory); @@ -808,54 +600,18 @@ template struct ProjectionForwardUT { mNumDistortionCoeffs > 0 ? 
reinterpret_cast(pointer) : nullptr; pointer += C * mNumDistortionCoeffs * sizeof(ScalarType); - // Layout in element units: - const int64_t projectionOffset = 0; - const int64_t rotStartOffset = projectionOffset + C * kMat3Elements; - const int64_t rotEndOffset = rotStartOffset + C * kMat3Elements; - const int64_t transStartOffset = rotEndOffset + C * kMat3Elements; - const int64_t transEndOffset = transStartOffset + C * kVec3Elements; - const int64_t distortionOffset = transEndOffset + C * kVec3Elements; - const int64_t totalElements = distortionOffset + C * mNumDistortionCoeffs; - - for (int64_t i = threadIdx.x; i < totalElements; i += blockDim.x) { - if (i < rotStartOffset) { - const auto camId = (i - projectionOffset) / kMat3Elements; - const auto entryId = (i - projectionOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - projectionMatsShared[camId][rowId][colId] = - mProjectionMatricesAcc[camId][rowId][colId]; - } else if (i < rotEndOffset) { - const auto camId = (i - rotStartOffset) / kMat3Elements; - const auto entryId = (i - rotStartOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - worldToCamRotMatsStartShared[camId][rowId][colId] = - mWorldToCamMatricesStartAcc[camId][rowId][colId]; - } else if (i < transStartOffset) { - const auto camId = (i - rotEndOffset) / kMat3Elements; - const auto entryId = (i - rotEndOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - worldToCamRotMatsEndShared[camId][rowId][colId] = - mWorldToCamMatricesEndAcc[camId][rowId][colId]; - } else if (i < transEndOffset) { - const auto camId = (i - transStartOffset) / kVec3Elements; - const auto entryId = (i - transStartOffset) % kVec3Elements; - worldToCamTranslationStartShared[camId][entryId] = - mWorldToCamMatricesStartAcc[camId][entryId][3]; - } else if (i < distortionOffset) { - 
const auto camId = (i - transEndOffset) / kVec3Elements; - const auto entryId = (i - transEndOffset) % kVec3Elements; - worldToCamTranslationEndShared[camId][entryId] = - mWorldToCamMatricesEndAcc[camId][entryId][3]; - } else if (mNumDistortionCoeffs > 0) { - const auto baseIdx = i - distortionOffset; - const auto camId = baseIdx / mNumDistortionCoeffs; - const auto entryId = baseIdx % mNumDistortionCoeffs; - distortionCoeffsShared[camId * mNumDistortionCoeffs + entryId] = - mDistortionCoeffsAcc[camId][entryId]; - } + copyMat3AccessorToShared(C, projectionMatsShared, mProjectionMatricesAcc); + copyWorldToCamRotationToShared( + C, worldToCamRotMatsStartShared, mWorldToCamMatricesStartAcc); + copyWorldToCamRotationToShared( + C, worldToCamRotMatsEndShared, mWorldToCamMatricesEndAcc); + copyWorldToCamTranslationToShared( + C, worldToCamTranslationStartShared, mWorldToCamMatricesStartAcc); + copyWorldToCamTranslationToShared( + C, worldToCamTranslationEndShared, mWorldToCamMatricesEndAcc); + if (mNumDistortionCoeffs > 0) { + copyDistortionCoeffsToShared( + C, mNumDistortionCoeffs, distortionCoeffsShared, mDistortionCoeffsAcc); } } diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h index 6fc780c6..164e4f41 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h @@ -4,6 +4,8 @@ #ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUT_H #define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUT_H +#include + #include #include @@ -13,31 +15,6 @@ namespace fvdb { namespace detail { namespace ops { -enum class RollingShutterType { NONE = 0, VERTICAL = 1, HORIZONTAL = 2 }; - -/// @brief Camera model for projection in the UT kernel. -/// -/// This enum describes the camera projection family used by the UT kernel. It is intentionally -/// broader than "distortion model" so we can add more complex models (e.g. RPC) later. 
-/// -/// Notes: -/// - `PINHOLE` and `ORTHOGRAPHIC` ignore `distortionCoeffs`. -/// - `OPENCV_*` variants are pinhole + OpenCV-style distortion, and require packed coefficients. -enum class CameraModel : int32_t { - // Pinhole intrinsics only (no distortion). - PINHOLE = 0, - - // Orthographic intrinsics (no distortion). - ORTHOGRAPHIC = 5, - - // OpenCV variants which are just pinhole intrinsics + optional distortion (all of them use the - // same [C,12] distortion coefficients layout: [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]). - OPENCV_RADTAN_5 = 1, // polynomial radial (k1,k2,k3) + tangential (p1,p2)). - OPENCV_RATIONAL_8 = 2, // rational radial (k1..k6) + tangential (p1,p2)). - OPENCV_RADTAN_THIN_PRISM_9 = 3, // polynomial radial + tangential + thin-prism (s1..s4)). - OPENCV_THIN_PRISM_12 = 4, // rational radial + tangential + thin-prism (s1..s4)). -}; - /// @brief Unscented Transform hyperparameters. /// /// This kernel implements the canonical 3D UT with a fixed \(2D+1\) sigma point set (7 points). 
diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh b/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh new file mode 100644 index 00000000..06128665 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh @@ -0,0 +1,109 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH + +#include +#include +#include + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __device__ bool +isOutsideImageWithRadius(const nanovdb::math::Vec2 &mean2d, + const T radiusX, + const T radiusY, + const int32_t imageWidth, + const int32_t imageHeight) { + return (mean2d[0] + radiusX <= T(0) || mean2d[0] - radiusX >= T(imageWidth) || + mean2d[1] + radiusY <= T(0) || mean2d[1] - radiusY >= T(imageHeight)); +} + +template +inline __device__ T +radiusFromCovariance2dDet(const nanovdb::math::Mat2 &covar2d, + const T det, + const T sigma, + const T detClamp = T(0.01)) { + // Matches the classic FVDB/gsplat heuristic: use the largest eigenvalue of the 2x2 covariance + // (via trace/determinant) and take `sigma` standard deviations. 
+ const T b = T(0.5) * (covar2d[0][0] + covar2d[1][1]); + const T v1 = b + ::cuda::std::sqrt(nanovdb::math::Max(detClamp, b * b - det)); + return ::cuda::std::ceil(sigma * ::cuda::std::sqrt(v1)); +} + +template +inline __device__ void +storeConicRowMajor3(const nanovdb::math::Mat2 &covar2dInverse, T *outConic3) { + outConic3[0] = covar2dInverse[0][0]; + outConic3[1] = covar2dInverse[0][1]; + outConic3[2] = covar2dInverse[1][1]; +} + +template +inline __device__ nanovdb::math::Vec3 +packConicRowMajor3(const nanovdb::math::Mat2 &covar2dInverse) { + return nanovdb::math::Vec3(covar2dInverse[0][0], covar2dInverse[0][1], covar2dInverse[1][1]); +} + +template +inline __device__ nanovdb::math::Mat2 +loadConicRowMajor3(const T *conic3) { + return nanovdb::math::Mat2(conic3[0], conic3[1], conic3[1], conic3[2]); +} + +template +inline __device__ nanovdb::math::Mat2 +loadConicGradRowMajor3(const T *dConic3) { + // Off-diagonal appears once in packed representation but corresponds to two symmetric entries + // in the full 2x2 matrix. 
+ return nanovdb::math::Mat2(dConic3[0], dConic3[1] * T(0.5), dConic3[1] * T(0.5), dConic3[2]); +} + +template +inline __device__ nanovdb::math::Mat3 +loadCovarianceRowMajor6(const T *covars6) { + // Packed symmetric layout: [xx, xy, xz, yy, yz, zz] + return nanovdb::math::Mat3(covars6[0], + covars6[1], + covars6[2], // 1st row + covars6[1], + covars6[3], + covars6[4], // 2nd row + covars6[2], + covars6[4], + covars6[5] // 3rd row + ); +} + +template +inline __device__ void +loadQuatScaleFromLogScalesRowMajor(const T *quats4, + const T *logScales3, + nanovdb::math::Vec4 &outQuatWxyz, + nanovdb::math::Vec3 &outScale) { + outQuatWxyz = nanovdb::math::Vec4(quats4[0], quats4[1], quats4[2], quats4[3]); + outScale = nanovdb::math::Vec3(::cuda::std::exp(logScales3[0]), + ::cuda::std::exp(logScales3[1]), + ::cuda::std::exp(logScales3[2])); +} + +template +inline __device__ void +loadQuatScaleFromScalesRowMajor(const T *quats4, + const T *scales3, + nanovdb::math::Vec4 &outQuatWxyz, + nanovdb::math::Vec3 &outScale) { + outQuatWxyz = nanovdb::math::Vec4(quats4[0], quats4[1], quats4[2], quats4[3]); + outScale = nanovdb::math::Vec3(scales3[0], scales3[1], scales3[2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu index dc620b4f..a4ace0fe 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu @@ -259,11 +259,21 @@ rasterizeGaussiansForward(RasterizeForwardArgs( const uint32_t imageOriginW, const uint32_t imageOriginH, const uint32_t tileSize, - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + 
const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { FVDB_FUNC_RANGE(); const uint32_t channels = features.size(-1); const bool isPacked = means2d.dim() == 2; - const std::optional masks = std::nullopt; - #define CALL_FWD_CUDA(N) \ case N: { \ if (isPacked) { \ @@ -676,16 +685,15 @@ dispatchGaussianRasterizeForward( const uint32_t imageOriginH, const uint32_t tileSize, // intersections - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { FVDB_FUNC_RANGE(); const uint32_t channels = features.size(-1); const bool isPacked = means2d.dim() == 2; - const std::optional masks = std::nullopt; - #define CALL_FWD_PRIVATEUSE1(N) \ case N: { \ if (isPacked) { \ @@ -768,9 +776,10 @@ dispatchGaussianRasterizeForward( const uint32_t imageOriginH, const uint32_t tileSize, // intersections - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); } diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h index 21f1e5d0..31db01a9 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h @@ -56,9 +56,10 @@ std::tuple dispatchGaussianRasteriz 
const uint32_t imageOriginW, const uint32_t imageOriginH, const uint32_t tileSize, - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds = at::nullopt // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds = at::nullopt, // [C, D] + const at::optional &masks = at::nullopt // [C, tile_height, tile_width] bool ); /// @brief Dispatches the sparse Gaussian rasterization forward pass to the specified device. diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh new file mode 100644 index 00000000..09a53ee0 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh @@ -0,0 +1,254 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace fvdb::detail::ops { + +/// Opacity threshold used by 3DGS alpha compositing. +constexpr __device__ float kAlphaThreshold = 0.999f; + +/// Common dense-tile rasterization arguments shared by from-world forward/backward kernels. 
+struct RasterizeFromWorldCommonArgs { + using TileOffsetsAccessor = fvdb::TorchRAcc64; + using TileGaussianIdsAccessor = fvdb::TorchRAcc64; + + uint32_t numCameras; + uint32_t imageWidth; + uint32_t imageHeight; + uint32_t imageOriginW; + uint32_t imageOriginH; + uint32_t tileSize; + uint32_t numTilesW; + uint32_t numTilesH; + uint32_t numChannels; + int32_t totalIntersections; + + TileOffsetsAccessor tileOffsets; // [C, TH, TW] + TileGaussianIdsAccessor tileGaussianIds; // [n_isects] + const float *backgrounds; // [C, D] or nullptr + const bool *masks; // [C, TH, TW] or nullptr + + inline __device__ void + denseCoordinates(uint32_t &cameraId, + uint32_t &tileRow, + uint32_t &tileCol, + uint32_t &row, + uint32_t &col) const { + const uint32_t linearBlock = blockIdx.x; + cameraId = linearBlock / (numTilesH * numTilesW); + const uint32_t tileLinear = linearBlock - cameraId * (numTilesH * numTilesW); + tileRow = tileLinear / numTilesW; + tileCol = tileLinear - tileRow * numTilesW; + row = tileRow * tileSize + threadIdx.y; + col = tileCol * tileSize + threadIdx.x; + } + + inline __device__ uint32_t + tileId(const uint32_t cameraId, const uint32_t tileRow, const uint32_t tileCol) const { + return cameraId * numTilesH * numTilesW + tileRow * numTilesW + tileCol; + } + + inline __device__ bool + tileMasked(const uint32_t cameraId, const uint32_t tileRow, const uint32_t tileCol) const { + return (masks != nullptr) && (!masks[tileId(cameraId, tileRow, tileCol)]); + } + + inline __device__ cuda::std::tuple + tileGaussianRange(const uint32_t cameraId, + const uint32_t tileRow, + const uint32_t tileCol) const { + const int32_t rangeStart = tileOffsets[cameraId][tileRow][tileCol]; + + const int32_t rangeEnd = + ((cameraId == numCameras - 1) && (tileRow == numTilesH - 1) && + (tileCol == numTilesW - 1)) + ? totalIntersections + : ((tileCol + 1 < numTilesW) + ? tileOffsets[cameraId][tileRow][tileCol + 1] + : ((tileRow + 1 < numTilesH) ? 
tileOffsets[cameraId][tileRow + 1][0] + : tileOffsets[cameraId + 1][0][0])); + return {rangeStart, rangeEnd}; + } + + inline __device__ uint32_t + pixelId(const uint32_t row, const uint32_t col) const { + return row * imageWidth + col; + } + + inline __device__ uint32_t + outputPixelBase(const uint32_t cameraId, const uint32_t pixId) const { + return cameraId * imageHeight * imageWidth + pixId; + } + + inline __device__ uint32_t + outputFeatureBase(const uint32_t cameraId, const uint32_t pixId) const { + return outputPixelBase(cameraId, pixId) * numChannels; + } + + inline __device__ float + backgroundValue(const uint32_t cameraId, const uint32_t channelId) const { + return (backgrounds != nullptr) ? backgrounds[cameraId * numChannels + channelId] : 0.0f; + } +}; + +template +inline __device__ nanovdb::math::Vec3 +normalizeSafe(const nanovdb::math::Vec3 &v) { + const T n2 = v.dot(v); + if (n2 > T(0)) { + return v * (T(1) / sqrt(n2)); + } + return nanovdb::math::Vec3(T(0), T(0), T(0)); +} + +/// Vector-Jacobian product for y = normalizeSafe(x). 
+template +inline __device__ nanovdb::math::Vec3 +normalizeSafeVJP(const nanovdb::math::Vec3 &x, const nanovdb::math::Vec3 &v_y) { + const T n2 = x.dot(x); + if (!(n2 > T(0))) { + return nanovdb::math::Vec3(T(0), T(0), T(0)); + } + const T n = sqrt(n2); + const T invn = T(1) / n; + // v_x = (I/n - x x^T / n^3) v_y + const T invn3 = invn * invn * invn; + const T xdotv = x.dot(v_y); + return v_y * invn - x * (xdotv * invn3); +} + +template +inline __device__ nanovdb::math::Ray +pixelToWorldRay(const uint32_t row, + const uint32_t col, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const nanovdb::math::Mat3 &R_wc_start, + const nanovdb::math::Vec3 &t_wc_start, + const nanovdb::math::Mat3 &R_wc_end, + const nanovdb::math::Vec3 &t_wc_end, + const nanovdb::math::Mat3 &K, + const T *distCoeffs, + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel) { + // Pixel center in crop coordinates. + const T px = T(col) + T(imageOriginW) + T(0.5); + const T py = T(row) + T(imageOriginH) + T(0.5); + + // Rolling shutter time based on pixel location. + const T u = rollingShutterTimeFromPixel(rollingShutterType, px, py, imageWidth, imageHeight); + + nanovdb::math::Mat3 R_wc; + nanovdb::math::Vec3 t_wc; + if (rollingShutterType == RollingShutterType::NONE) { + R_wc = R_wc_start; + t_wc = t_wc_start; + } else { + const RigidTransform worldToCamStart(R_wc_start, t_wc_start); + const RigidTransform worldToCamEnd(R_wc_end, t_wc_end); + const RigidTransform worldToCam = + RigidTransform::interpolate(u, worldToCamStart, worldToCamEnd); + R_wc = worldToCam.R; + t_wc = worldToCam.t; + } + + // Invert rigid transform to get camera->world. 
+ const nanovdb::math::Mat3 R_cw = R_wc.transpose(); + + const CameraIntrinsics intr = loadIntrinsics(K); + const nanovdb::math::Vec2 p_distorted((px - intr.cx) / intr.fx, (py - intr.cy) / intr.fy); + const nanovdb::math::Vec2 p = + undistortOpenCVPackedFixedPoint(cameraModel, p_distorted, distCoeffs, numDistCoeffs); + + // Note: `nanovdb::math::Ray` is the standard ray type used elsewhere in FVDB. + // We store world-space `eye` and `dir` (normalized) and leave [t0,t1] at default values. + if (cameraModel == CameraModel::ORTHOGRAPHIC) { + // Parallel rays; origin varies with pixel. + const nanovdb::math::Vec3 o_cam(p[0], p[1], T(0)); + const nanovdb::math::Vec3 d_cam(T(0), T(0), T(1)); + // p_world = R_cw * (p_cam - t_wc) + const nanovdb::math::Vec3 origin_w = R_cw * (o_cam - t_wc); + nanovdb::math::Vec3 dir_w = normalizeSafe(R_cw * d_cam); + return nanovdb::math::Ray(origin_w, dir_w); + } + + // Perspective (pinhole / OpenCV distorted pinhole): origin at camera center. + const nanovdb::math::Vec3 d_cam = normalizeSafe(nanovdb::math::Vec3(p[0], p[1], T(1))); + const nanovdb::math::Vec3 origin_w = + R_cw * (nanovdb::math::Vec3(T(0), T(0), T(0)) - t_wc); + nanovdb::math::Vec3 dir_w = normalizeSafe(R_cw * d_cam); + return nanovdb::math::Ray(origin_w, dir_w); +} + +/// Load quaternion in [w,x,y,z] order. +template +inline __device__ nanovdb::math::Vec4 +quatLoadWxyz(const T *q) { + return nanovdb::math::Vec4(q[0], q[1], q[2], q[3]); +} + +/// Build S^{-1} R^T from quaternion + scale. +template +inline __device__ nanovdb::math::Mat3 +computeIsclRot(const nanovdb::math::Vec4 &quat_wxyz, const nanovdb::math::Vec3 &scale) { + const nanovdb::math::Mat3 R = quaternionToRotationMatrix(quat_wxyz); + const nanovdb::math::Mat3 S_inv( + T(1) / scale[0], T(0), T(0), T(0), T(1) / scale[1], T(0), T(0), T(0), T(1) / scale[2]); + return S_inv * R.transpose(); +} + +/// Vector-Jacobian product for isclRot = S^{-1} R^T. 
+template +inline __device__ void +isclRotVectorJacobianProduct(const nanovdb::math::Vec4 &quat_wxyz, + const nanovdb::math::Vec3 &scale, + const nanovdb::math::Mat3 &dLossDIsclRot, + nanovdb::math::Vec4 &dLossDQuat, + nanovdb::math::Vec3 &dLossDLogScale) { + // TODO(fvdb): Consider returning {dLossDQuat, dLossDLogScale} to match other VJP helpers in + // this module (e.g. `transformCovarianceWorldToCamVectorJacobianProduct`), rather than using + // output reference parameters. + // iscl_rot = S_inv * R^T, with S_inv = diag(1/scale) + const nanovdb::math::Mat3 R = quaternionToRotationMatrix(quat_wxyz); + const nanovdb::math::Vec3 invScale(T(1) / scale[0], T(1) / scale[1], T(1) / scale[2]); + + // For D = A * B, dA = G * B^T, dB = A^T * G. + // Here A = S_inv (diag), B = R^T. + const nanovdb::math::Mat3 dA = dLossDIsclRot * R; // since (R^T)^T = R + const nanovdb::math::Mat3 A( + invScale[0], T(0), T(0), T(0), invScale[1], T(0), T(0), T(0), invScale[2]); + const nanovdb::math::Mat3 dB = A * dLossDIsclRot; + const nanovdb::math::Mat3 dR = dB.transpose(); // B = R^T + + dLossDQuat = quaternionToRotationMatrixVectorJacobianProduct(quat_wxyz, dR); + + // Diagonal of dA gives gradients w.r.t invScale. 
+ const nanovdb::math::Vec3 dInvScale(dA[0][0], dA[1][1], dA[2][2]); + // invScale = 1/scale => dlogScale = dscale * scale = -dInvScale / scale + dLossDLogScale = nanovdb::math::Vec3( + -dInvScale[0] * invScale[0], -dInvScale[1] * invScale[1], -dInvScale[2] * invScale[2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu new file mode 100644 index 00000000..93975586 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu @@ -0,0 +1,624 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace fvdb::detail::ops { +namespace cg = cooperative_groups; + +namespace { + +template struct SharedGaussian { + int32_t id; // flattened id in [0, C*N) + nanovdb::math::Vec3 mean; // world mean + nanovdb::math::Vec4 quat; // wxyz + nanovdb::math::Vec3 scale; // exp(log_scales) + nanovdb::math::Mat3 isclR; // S^{-1} R^T + float opacity; +}; + +template +__global__ void +rasterizeFromWorld3DGSBackwardKernel( + const RasterizeFromWorldCommonArgs commonArgs, + // Gaussians + const torch::PackedTensorAccessor64 means, // [N,3] + const torch::PackedTensorAccessor64 quats, // [N,4] + const torch::PackedTensorAccessor64 logScales, // [N,3] + // Per-camera + const torch::PackedTensorAccessor64 features, // [C,N,D] + const torch::PackedTensorAccessor64 opacities, // [C,N] + const torch::PackedTensorAccessor64 + worldToCamStart, // [C,4,4] + const torch::PackedTensorAccessor64 + worldToCamEnd, // [C,4,4] + const torch::PackedTensorAccessor64 K, // [C,3,3] + const torch::PackedTensorAccessor64 + distortionCoeffs, // [C,K] + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + 
const CameraModel cameraModel, + // Forward outputs + const torch::PackedTensorAccessor64 + renderedAlphas, // [C,H,W,1] + const torch::PackedTensorAccessor64 lastIds, // [C,H,W] + // Grad outputs + const torch::PackedTensorAccessor64 + dLossDRenderedFeatures, // [C,H,W,D] + const torch::PackedTensorAccessor64 + dLossDRenderedAlphas, // [C,H,W,1] + // Outputs (grads) + float *__restrict__ dMeans, // [N,3] + float *__restrict__ dQuats, // [N,4] + float *__restrict__ dLogScales, // [N,3] + float *__restrict__ dFeatures, // [C,N,D] + float *__restrict__ dOpacities // [C,N] +) { + auto block = cg::this_thread_block(); + const uint32_t blockSize = blockDim.x * blockDim.y; + + uint32_t camId, tileRow, tileCol, row, col; + commonArgs.denseCoordinates(camId, tileRow, tileCol, row, col); + const bool inside = (row < commonArgs.imageHeight && col < commonArgs.imageWidth); + + // Parity with classic rasterizer: masked tiles contribute nothing. + // + // IMPORTANT: this kernel uses block-level barriers later (`block.sync`). Any early return must + // be taken by *all* threads in the block, otherwise edge tiles can deadlock when some threads + // are `!inside`. So we make the return block-wide. + const bool tileMasked = commonArgs.tileMasked(camId, tileRow, tileCol); + if (tileMasked) { + return; + } + + // Camera pose for this camera. + nanovdb::math::Mat3 R_wc_start, R_wc_end; + nanovdb::math::Vec3 t_wc_start, t_wc_end; + { + loadWorldToCamRtFromAccessor44(worldToCamStart, camId, R_wc_start, t_wc_start); + loadWorldToCamRtFromAccessor44(worldToCamEnd, camId, R_wc_end, t_wc_end); + } + + const nanovdb::math::Mat3 K_cam = loadMat3FromAccessor33(K, camId); + const float *distPtr = (numDistCoeffs > 0) ? 
&distortionCoeffs[camId][0] : nullptr; + + const nanovdb::math::Ray ray = pixelToWorldRay(row, + col, + commonArgs.imageWidth, + commonArgs.imageHeight, + commonArgs.imageOriginW, + commonArgs.imageOriginH, + R_wc_start, + t_wc_start, + R_wc_end, + t_wc_end, + K_cam, + distPtr, + numDistCoeffs, + rollingShutterType, + cameraModel); + + // Whether this pixel participates in the backward pass. + // + // NOTE: We must *not* early-return for `!inside` because the kernel uses `block.sync` later. + const bool rayValid = ray.dir().dot(ray.dir()) > 0.0f; + const bool done = inside && rayValid; + + // Gaussian range for this tile. + const auto [rangeStart, rangeEnd] = commonArgs.tileGaussianRange(camId, tileRow, tileCol); + + // If the tile has no intersections, there is nothing to do. This must be a block-wide return. + if (rangeEnd <= rangeStart) { + return; + } + + // Forward state for this pixel. + int32_t binFinal = -1; + float T_final = 1.0f; + float T = 1.0f; + + float v_render_c[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_render_c[k] = 0.f; + } + float v_render_a = 0.f; + + if (done) { + binFinal = lastIds[camId][row][col]; + + const float alphaFinal = renderedAlphas[camId][row][col][0]; + T_final = 1.0f - alphaFinal; + T = T_final; + +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_render_c[k] = dLossDRenderedFeatures[camId][row][col][k]; + } + v_render_a = dLossDRenderedAlphas[camId][row][col][0]; + } + + float buffer[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + buffer[k] = 0.f; + } + + // Shared memory for gaussian batches. 
+ extern __shared__ char smem[]; + int32_t *idBatch = reinterpret_cast(smem); // [blockSize] + auto *gBatch = + reinterpret_cast *>(&idBatch[blockSize]); // [blockSize] + + const uint32_t threadRank = block.thread_rank(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + const int32_t nIsects = rangeEnd - rangeStart; + const uint32_t nBatches = (nIsects + (int32_t)blockSize - 1) / (int32_t)blockSize; + + // Reduce max binFinal within warp (used to early-skip). + const int32_t warpBinFinal = cg::reduce(warp, binFinal, cg::greater()); + + for (uint32_t b = 0; b < nBatches; ++b) { + block.sync(); + const int32_t batchEnd = rangeEnd - 1 - (int32_t)(blockSize * b); + const int32_t batchSize = min((int32_t)blockSize, batchEnd + 1 - rangeStart); + const int32_t idx = batchEnd - (int32_t)threadRank; + + if (idx >= rangeStart) { + const int32_t flatId = commonArgs.tileGaussianIds[idx]; + idBatch[threadRank] = flatId; + const int32_t gid = flatId % (int32_t)means.size(0); + const int32_t cid = flatId / (int32_t)means.size(0); + + const nanovdb::math::Vec3 mean_w(means[gid][0], means[gid][1], means[gid][2]); + const nanovdb::math::Vec4 quat_wxyz( + quats[gid][0], quats[gid][1], quats[gid][2], quats[gid][3]); + const nanovdb::math::Vec3 scale( + __expf(logScales[gid][0]), __expf(logScales[gid][1]), __expf(logScales[gid][2])); + const nanovdb::math::Mat3 isclR = computeIsclRot(quat_wxyz, scale); + const float op = opacities[cid][gid]; + + gBatch[threadRank].id = flatId; + gBatch[threadRank].mean = mean_w; + gBatch[threadRank].quat = quat_wxyz; + gBatch[threadRank].scale = scale; + gBatch[threadRank].isclR = isclR; + gBatch[threadRank].opacity = op; + } + + block.sync(); + + // Process gaussians in this batch, from back-to-front. 
+ const int32_t startT = max(0, batchEnd - warpBinFinal); + for (int32_t t = startT; t < batchSize; ++t) { + bool valid = done; + if (batchEnd - t > binFinal) { + valid = false; + } + + float alpha = 0.f; + float opac = 0.f; + float vis = 0.f; + + nanovdb::math::Vec3 mean_w(0.f); + nanovdb::math::Vec4 quat_wxyz(1.f, 0.f, 0.f, 0.f); + nanovdb::math::Vec3 scale(1.f); + nanovdb::math::Mat3 Mt; + nanovdb::math::Vec3 o_minus_mu(0.f); + nanovdb::math::Vec3 gro(0.f), grd(0.f), grd_n(0.f), gcrod(0.f); + float grayDist = 0.f; + + if (valid) { + const SharedGaussian g = gBatch[t]; + mean_w = g.mean; + quat_wxyz = g.quat; + scale = g.scale; + Mt = g.isclR; + opac = g.opacity; + + o_minus_mu = ray.eye() - mean_w; + gro = Mt * o_minus_mu; + grd = Mt * ray.dir(); + grd_n = normalizeSafe(grd); + gcrod = grd_n.cross(gro); + grayDist = gcrod.dot(gcrod); + const float power = -0.5f * grayDist; + vis = __expf(power); + alpha = min(kAlphaThreshold, opac * vis); + if (power > 0.f || alpha < 1.f / 255.f) { + valid = false; + } + } + + if (!warp.any(valid)) { + continue; + } + + float v_feat_local[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_feat_local[k] = 0.f; + } + + nanovdb::math::Vec3 v_mean_local(0.f); + float v_quat_local[4] = {0.f, 0.f, 0.f, 0.f}; + nanovdb::math::Vec3 v_logscale_local(0.f); + float v_opacity_local = 0.f; + + if (valid) { + const float ra = 1.0f / (1.0f - alpha); + T *= ra; + + const float fac = alpha * T; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_feat_local[k] = fac * v_render_c[k]; + } + + // v_alpha accumulation + float v_alpha = 0.f; + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + const float c = features[cid][gid][k]; + v_alpha += (c * T - buffer[k] * ra) * v_render_c[k]; + } + + v_alpha += T_final * ra * v_render_a; + + if 
(commonArgs.backgrounds != nullptr) { + float accum = 0.f; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + accum += commonArgs.backgroundValue(camId, k) * v_render_c[k]; + } + v_alpha += -T_final * ra * accum; + } + + if (opac * vis <= kAlphaThreshold) { + const float v_vis = opac * v_alpha; + const float v_gradDist = -0.5f * vis * v_vis; + const nanovdb::math::Vec3 v_gcrod = 2.0f * v_gradDist * gcrod; + const nanovdb::math::Vec3 v_grd_n = -(v_gcrod.cross(gro)); + const nanovdb::math::Vec3 v_gro = v_gcrod.cross(grd_n); + + const nanovdb::math::Vec3 v_grd = normalizeSafeVJP(grd, v_grd_n); + + // v_Mt = outer(v_grd, ray.dir) + outer(v_gro, (ray.eye() - mean)) + const nanovdb::math::Vec3 rayDir = ray.dir(); + nanovdb::math::Mat3 v_Mt = + nanovdb::math::Mat3(v_grd[0] * rayDir[0], + v_grd[0] * rayDir[1], + v_grd[0] * rayDir[2], + v_grd[1] * rayDir[0], + v_grd[1] * rayDir[1], + v_grd[1] * rayDir[2], + v_grd[2] * rayDir[0], + v_grd[2] * rayDir[1], + v_grd[2] * rayDir[2]); + v_Mt += nanovdb::math::Mat3(v_gro[0] * o_minus_mu[0], + v_gro[0] * o_minus_mu[1], + v_gro[0] * o_minus_mu[2], + v_gro[1] * o_minus_mu[0], + v_gro[1] * o_minus_mu[1], + v_gro[1] * o_minus_mu[2], + v_gro[2] * o_minus_mu[0], + v_gro[2] * o_minus_mu[1], + v_gro[2] * o_minus_mu[2]); + + const nanovdb::math::Vec3 v_o_minus_mu = Mt.transpose() * v_gro; + v_mean_local += -v_o_minus_mu; + + nanovdb::math::Vec4 dQuat(0.f); + nanovdb::math::Vec3 dLogScale(0.f); + isclRotVectorJacobianProduct(quat_wxyz, scale, v_Mt, dQuat, dLogScale); + v_quat_local[0] += dQuat[0]; + v_quat_local[1] += dQuat[1]; + v_quat_local[2] += dQuat[2]; + v_quat_local[3] += dQuat[3]; + v_logscale_local += dLogScale; + + v_opacity_local = vis * v_alpha; + } + + // buffer update +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + buffer[k] += features[cid][gid][k] * 
fac; + } + } + + // Warp-reduce and atomic add once per gaussian per warp. + warpSumMut(v_opacity_local, warp); + warpSumMut(v_mean_local, warp); + warpSumMut<4>(v_quat_local, warp); + warpSumMut(v_logscale_local, warp); + warpSumMut(v_feat_local, warp); + + if (warp.thread_rank() == 0) { + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + + // Per-camera grads + float *dFeatPtr = + dFeatures + ((cid * (int32_t)means.size(0) + gid) * (int32_t)NUM_CHANNELS); +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + atomicAdd_system(dFeatPtr + k, v_feat_local[k]); + } + atomicAdd_system(dOpacities + (cid * (int32_t)means.size(0) + gid), + v_opacity_local); + + // Geometry grads (shared across cameras) + float *dMeanPtr = dMeans + gid * 3; + atomicAdd_system(dMeanPtr + 0, v_mean_local[0]); + atomicAdd_system(dMeanPtr + 1, v_mean_local[1]); + atomicAdd_system(dMeanPtr + 2, v_mean_local[2]); + + float *dQuatPtr = dQuats + gid * 4; + atomicAdd_system(dQuatPtr + 0, v_quat_local[0]); + atomicAdd_system(dQuatPtr + 1, v_quat_local[1]); + atomicAdd_system(dQuatPtr + 2, v_quat_local[2]); + atomicAdd_system(dQuatPtr + 3, v_quat_local[3]); + + float *dLsPtr = dLogScales + gid * 3; + atomicAdd_system(dLsPtr + 0, v_logscale_local[0]); + atomicAdd_system(dLsPtr + 1, v_logscale_local[1]); + atomicAdd_system(dLsPtr + 2, v_logscale_local[2]); + } + } + } +} + +template +std::tuple +launchBackward(const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamStart, + const torch::Tensor &worldToCamEnd, + const torch::Tensor &K, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t 
imageOriginH, + const uint32_t tileSize, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const torch::Tensor &renderedAlphas, + const torch::Tensor &lastIds, + const torch::Tensor &dLossDRenderedFeatures, + const torch::Tensor &dLossDRenderedAlphas, + const at::optional &backgrounds, + const at::optional &masks) { + const int64_t C = features.size(0); + const int64_t N = means.size(0); + + torch::Tensor dMeans = torch::zeros_like(means); + torch::Tensor dQuats = torch::zeros_like(quats); + torch::Tensor dLogScales = torch::zeros_like(logScales); + torch::Tensor dFeatures = torch::zeros_like(features); + torch::Tensor dOpacities = torch::zeros_like(opacities); + + const uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; + const uint32_t tileExtentH = (imageHeight + tileSize - 1) / tileSize; + const dim3 blockDim(tileSize, tileSize, 1); + const dim3 gridDim(C * tileExtentH * tileExtentW, 1, 1); + const int32_t totalIntersections = static_cast(tileGaussianIds.size(0)); + const int64_t numDistCoeffs = distortionCoeffs.size(1); + + RasterizeFromWorldCommonArgs commonArgs{ + static_cast(C), + imageWidth, + imageHeight, + imageOriginW, + imageOriginH, + tileSize, + tileExtentW, + tileExtentH, + NUM_CHANNELS, + totalIntersections, + tileOffsets.packed_accessor64(), + tileGaussianIds.packed_accessor64(), + nullptr, + nullptr}; + + const PreparedRasterOptionalInputs opt = prepareRasterOptionalInputs( + features, C, tileExtentH, tileExtentW, (int64_t)NUM_CHANNELS, backgrounds, masks); + commonArgs.backgrounds = opt.backgrounds; + commonArgs.masks = opt.masks; + + const size_t blockSize = (size_t)tileSize * (size_t)tileSize; + const size_t sharedMem = blockSize * (sizeof(int32_t) + sizeof(SharedGaussian)); + + auto stream = at::cuda::getDefaultCUDAStream(); + rasterizeFromWorld3DGSBackwardKernel<<>>( + commonArgs, + means.packed_accessor64(), + quats.packed_accessor64(), + logScales.packed_accessor64(), + features.packed_accessor64(), 
+ opacities.packed_accessor64(), + worldToCamStart.packed_accessor64(), + worldToCamEnd.packed_accessor64(), + K.packed_accessor64(), + distortionCoeffs.packed_accessor64(), + numDistCoeffs, + rollingShutterType, + cameraModel, + renderedAlphas.packed_accessor64(), + lastIds.packed_accessor64(), + dLossDRenderedFeatures.packed_accessor64(), + dLossDRenderedAlphas.packed_accessor64(), + dMeans.data_ptr(), + dQuats.data_ptr(), + dLogScales.data_ptr(), + dFeatures.data_ptr(), + dOpacities.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return {dMeans, dQuats, dLogScales, dFeatures, dOpacities}; +} + +} // namespace + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward( + const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamMatricesStart, + const torch::Tensor &worldToCamMatricesEnd, + const torch::Tensor &projectionMatrices, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const RenderSettings &settings, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const torch::Tensor &renderedAlphas, + const torch::Tensor &lastIds, + const torch::Tensor &dLossDRenderedFeatures, + const torch::Tensor &dLossDRenderedAlphas, + const at::optional &backgrounds, + const at::optional &masks) { + FVDB_FUNC_RANGE(); + + const uint32_t imageWidth = settings.imageWidth; + const uint32_t imageHeight = settings.imageHeight; + const uint32_t imageOriginW = settings.imageOriginW; + const uint32_t imageOriginH = settings.imageOriginH; + const uint32_t tileSize = settings.tileSize; + + TORCH_CHECK_VALUE(means.is_cuda(), "means must be CUDA"); + TORCH_CHECK_VALUE(features.is_cuda(), "features must be CUDA"); + TORCH_CHECK_VALUE(opacities.is_cuda(), "opacities must be CUDA"); + TORCH_CHECK_VALUE(renderedAlphas.is_cuda(), "renderedAlphas 
must be CUDA"); + TORCH_CHECK_VALUE(lastIds.is_cuda(), "lastIds must be CUDA"); + + const int64_t C = features.size(0); + const int64_t N = means.size(0); + + // Opacities may be provided either per-camera ([C,N]) or shared across cameras ([N]). + // TODO(fvdb): Avoid materializing a repeated [C,N] tensor when opacities are shared across + // cameras (similar to PR #451). + torch::Tensor opacitiesBatched = opacities; + if (opacitiesBatched.dim() == 1) { + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == N, + "opacities must have shape [N] or [C,N] matching N"); + opacitiesBatched = opacitiesBatched.unsqueeze(0).repeat({C, 1}); + } + opacitiesBatched = opacitiesBatched.contiguous(); + + const uint32_t channels = (uint32_t)features.size(2); + +#define CALL_BWD(NCH) \ + case NCH: \ + return launchBackward(means, \ + quats, \ + logScales, \ + features, \ + opacitiesBatched, \ + worldToCamMatricesStart, \ + worldToCamMatricesEnd, \ + projectionMatrices, \ + distortionCoeffs, \ + rollingShutterType, \ + cameraModel, \ + imageWidth, \ + imageHeight, \ + imageOriginW, \ + imageOriginH, \ + tileSize, \ + tileOffsets, \ + tileGaussianIds, \ + renderedAlphas, \ + lastIds, \ + dLossDRenderedFeatures, \ + dLossDRenderedAlphas, \ + backgrounds, \ + masks); + + switch (channels) { + CALL_BWD(1) + CALL_BWD(2) + CALL_BWD(3) + CALL_BWD(4) + CALL_BWD(5) + CALL_BWD(8) + CALL_BWD(9) + CALL_BWD(16) + CALL_BWD(17) + CALL_BWD(32) + CALL_BWD(33) + CALL_BWD(64) + CALL_BWD(65) + CALL_BWD(128) + CALL_BWD(129) + CALL_BWD(192) + CALL_BWD(193) + CALL_BWD(256) + CALL_BWD(257) + CALL_BWD(512) + CALL_BWD(513) + default: + TORCH_CHECK_VALUE( + false, "Unsupported channels for rasterize-from-world-3dgs backward: ", channels); + } + +#undef CALL_BWD +} + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward(const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const 
torch::Tensor &, + const torch::Tensor &, + const RollingShutterType, + const CameraModel, + const RenderSettings &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const at::optional &, + const at::optional &) { + TORCH_CHECK_VALUE(false, "dispatchGaussianRasterizeFromWorld3DGSBackward is CUDA-only"); +} + +} // namespace fvdb::detail::ops diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h new file mode 100644 index 00000000..4df897ab --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h @@ -0,0 +1,62 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H + +#include +#include + +#include + +#include +#include + +namespace fvdb::detail::ops { + +/// @brief Backward pass for dense rasterization from 3D Gaussians using per-pixel rays. +/// +/// Gradients are produced for: +/// - means: [N, 3] +/// - quats: [N, 4] +/// - logScales: [N, 3] +/// - features: [C, N, D] +/// - opacities: [C, N] +/// +/// @tparam DeviceType torch::kCUDA (CPU not implemented). 
+template +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward( + // Gaussian parameters (world space) + const torch::Tensor &means, // [N, 3] + const torch::Tensor &quats, // [N, 4] + const torch::Tensor &logScales, // [N, 3] + // Per-camera quantities + const torch::Tensor &features, // [C, N, D] + const torch::Tensor &opacities, // [C, N] + const torch::Tensor &worldToCamMatricesStart, // [C, 4, 4] + const torch::Tensor &worldToCamMatricesEnd, // [C, 4, 4] + const torch::Tensor &projectionMatrices, // [C, 3, 3] + const torch::Tensor &distortionCoeffs, // [C, K] (K=0 or 12) + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Render settings + const RenderSettings &settings, + // Intersections + const torch::Tensor &tileOffsets, // [C, tileH, tileW] + const torch::Tensor &tileGaussianIds, // [n_isects] values in [0, C*N) + // Forward outputs needed for backward + const torch::Tensor &renderedAlphas, // [C, H, W, 1] + const torch::Tensor &lastIds, // [C, H, W] + // Gradients of outputs + const torch::Tensor &dLossDRenderedFeatures, // [C, H, W, D] + const torch::Tensor &dLossDRenderedAlphas, // [C, H, W, 1] + // Optional background (only affects alpha gradient term) + const at::optional &backgrounds = at::nullopt, // [C, D] + // Optional tile masks (parity with classic rasterizer) + const at::optional &masks = at::nullopt // [C, tileH, tileW] bool +); + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu new file mode 100644 index 00000000..ceef41cb --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu @@ -0,0 +1,471 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include + +#include +#include +#include + 
+#include + +namespace fvdb::detail::ops { +namespace cg = cooperative_groups; + +namespace { + +template struct SharedGaussian { + int32_t id; // flattened id in [0, C*N) + nanovdb::math::Vec3 mean; // world mean + nanovdb::math::Mat3 isclR; // S^{-1} R^T + float opacity; +}; + +template +__global__ void +rasterizeFromWorld3DGSForwardKernel( + const RasterizeFromWorldCommonArgs commonArgs, + // Gaussians + const torch::PackedTensorAccessor64 means, // [N,3] + const torch::PackedTensorAccessor64 quats, // [N,4] + const torch::PackedTensorAccessor64 logScales, // [N,3] + // Per-camera + const torch::PackedTensorAccessor64 features, // [C,N,D] + const torch::PackedTensorAccessor64 opacities, // [C,N] + const torch::PackedTensorAccessor64 + worldToCamStart, // [C,4,4] (view as 3D accessor for contiguous) + const torch::PackedTensorAccessor64 + worldToCamEnd, // [C,4,4] + const torch::PackedTensorAccessor64 K, // [C,3,3] + const torch::PackedTensorAccessor64 + distortionCoeffs, // [C,K] + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Outputs + float *__restrict__ outFeatures, // [C,H,W,D] + float *__restrict__ outAlphas, // [C,H,W,1] packed as [C*H*W] + int32_t *__restrict__ outLastIds // [C,H,W] +) { + const uint32_t blockSize = blockDim.x * blockDim.y; + auto block = cg::this_thread_block(); + + uint32_t camId, tileRow, tileCol, row, col; + commonArgs.denseCoordinates(camId, tileRow, tileCol, row, col); + const bool inside = (row < commonArgs.imageHeight && col < commonArgs.imageWidth); + + // Pixel index within this camera. + const uint32_t pixId = commonArgs.pixelId(row, col); + const uint32_t imgBaseFeat = commonArgs.outputFeatureBase(camId, pixId); + const uint32_t imgBasePix = commonArgs.outputPixelBase(camId, pixId); + + // Parity with classic rasterizer: masked tiles write background and exit. 
+ // + // IMPORTANT: this kernel uses block-level barriers later (`__syncthreads_count`, `block.sync`). + // Any early return must be taken by *all* threads in the block, otherwise edge tiles can + // deadlock when some threads are `!inside`. So we make the return block-wide. + const bool tileMasked = commonArgs.tileMasked(camId, tileRow, tileCol); + if (tileMasked) { + if (inside) { + outAlphas[imgBasePix] = 0.0f; + outLastIds[imgBasePix] = -1; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = commonArgs.backgroundValue(camId, k); + } + } + return; + } + + // Build camera pose for this camera (start/end). + nanovdb::math::Mat3 worldToCamRotationStart, worldToCamRotationEnd; + nanovdb::math::Vec3 worldToCamTranslationStart, worldToCamTranslationEnd; + { + loadWorldToCamRtFromAccessor44( + worldToCamStart, camId, worldToCamRotationStart, worldToCamTranslationStart); + loadWorldToCamRtFromAccessor44( + worldToCamEnd, camId, worldToCamRotationEnd, worldToCamTranslationEnd); + } + + const nanovdb::math::Mat3 K_cam = loadMat3FromAccessor33(K, camId); + + const float *distPtr = (numDistCoeffs > 0) ? &distortionCoeffs[camId][0] : nullptr; + + const nanovdb::math::Ray ray = pixelToWorldRay(row, + col, + commonArgs.imageWidth, + commonArgs.imageHeight, + commonArgs.imageOriginW, + commonArgs.imageOriginH, + worldToCamRotationStart, + worldToCamTranslationStart, + worldToCamRotationEnd, + worldToCamTranslationEnd, + K_cam, + distPtr, + numDistCoeffs, + rollingShutterType, + cameraModel); + + const bool rayValid = ray.dir().dot(ray.dir()) > 0.0f; + bool done = (!inside) || (!rayValid); + + // Determine gaussian range for this tile. + const auto [rangeStart, rangeEnd] = commonArgs.tileGaussianRange(camId, tileRow, tileCol); + + // If no intersections, just write background. + // + // As above, this must be a block-wide return to avoid deadlocks on edge tiles. 
+ if (rangeEnd <= rangeStart) { + if (inside) { + // alpha=0, output background if provided else 0. + outAlphas[imgBasePix] = 0.0f; + outLastIds[imgBasePix] = -1; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = commonArgs.backgroundValue(camId, k); + } + } + return; + } + + // Shared memory for batched gaussians. + extern __shared__ char smem[]; + int32_t *idBatch = reinterpret_cast(smem); // [blockSize] + auto *gaussBatch = + reinterpret_cast *>(&idBatch[blockSize]); // [blockSize] + + float transmittance = 1.0f; + int32_t curIdx = -1; + float pixOut[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + pixOut[k] = 0.f; + } + + const int32_t nIsects = rangeEnd - rangeStart; + const uint32_t nBatches = (nIsects + blockSize - 1) / blockSize; + const uint32_t threadRank = block.thread_rank(); + + for (uint32_t b = 0; b < nBatches; ++b) { + if (__syncthreads_count(done) >= (int)blockSize) { + break; + } + + const int32_t batchStart = rangeStart + (int32_t)(blockSize * b); + const int32_t idx = batchStart + (int32_t)threadRank; + if (idx < rangeEnd) { + const int32_t flatId = commonArgs.tileGaussianIds[idx]; + idBatch[threadRank] = flatId; + const int32_t gid = flatId % (int32_t)means.size(0); + + const nanovdb::math::Vec3 mean_w(means[gid][0], means[gid][1], means[gid][2]); + const nanovdb::math::Vec4 quat_wxyz( + quats[gid][0], quats[gid][1], quats[gid][2], quats[gid][3]); + const nanovdb::math::Vec3 scale( + __expf(logScales[gid][0]), __expf(logScales[gid][1]), __expf(logScales[gid][2])); + const nanovdb::math::Mat3 isclR = computeIsclRot(quat_wxyz, scale); + const int32_t cid = flatId / (int32_t)means.size(0); + const float op = opacities[cid][gid]; + + gaussBatch[threadRank].id = flatId; + gaussBatch[threadRank].mean = mean_w; + gaussBatch[threadRank].isclR = isclR; + gaussBatch[threadRank].opacity = op; + } + + __syncthreads(); + + const uint32_t batchSize = + min((uint32_t)blockSize, 
(uint32_t)max(0, rangeEnd - batchStart)); + for (uint32_t t = 0; (t < batchSize) && !done; ++t) { + const SharedGaussian g = gaussBatch[t]; + // 3DGS ray-ellipsoid visibility in "whitened" coordinates (see 3D-GUT paper Fig. 11). + // gro = S^{-1} R^T (o - μ) + // grd = normalize( S^{-1} R^T d ) + // gcrod = grd × gro (distance proxy to principal axis in whitened space) + const nanovdb::math::Vec3 gro = g.isclR * (ray.eye() - g.mean); + const nanovdb::math::Vec3 grd = normalizeSafe(g.isclR * ray.dir()); + const nanovdb::math::Vec3 gcrod = grd.cross(gro); + const float grayDist = gcrod.dot(gcrod); + const float power = -0.5f * grayDist; + const float vis = __expf(power); + float alpha = min(kAlphaThreshold, g.opacity * vis); + if (power > 0.f || alpha < 1.f / 255.f) { + continue; + } + const float nextTransmittance = transmittance * (1.0f - alpha); + if (nextTransmittance <= 1e-4f) { + done = true; + break; + } + const float contrib = alpha * transmittance; + const int32_t cid = g.id / (int32_t)means.size(0); + const int32_t gid = g.id % (int32_t)means.size(0); +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + pixOut[k] += features[cid][gid][k] * contrib; + } + curIdx = (uint32_t)(batchStart + (int32_t)t); + transmittance = nextTransmittance; + } + } + + if (!inside) { + return; + } + + outAlphas[imgBasePix] = 1.0f - transmittance; + outLastIds[imgBasePix] = curIdx; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = + pixOut[k] + transmittance * commonArgs.backgroundValue(camId, k); + } +} + +template +std::tuple +launchForward(const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamStart, + const torch::Tensor &worldToCamEnd, + const torch::Tensor &K, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + 
const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const at::optional &backgrounds, + const at::optional &masks) { + const int64_t C = features.size(0); + + auto opts = features.options(); + torch::Tensor outFeatures = + torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth, (int64_t)NUM_CHANNELS}, opts); + torch::Tensor outAlphas = torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth, 1}, opts); + torch::Tensor outLastIds = + torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth}, + torch::TensorOptions().dtype(torch::kInt32).device(features.device())); + + const uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; + const uint32_t tileExtentH = (imageHeight + tileSize - 1) / tileSize; + const dim3 blockDim(tileSize, tileSize, 1); + const dim3 gridDim(C * tileExtentH * tileExtentW, 1, 1); + + const int32_t totalIntersections = (int32_t)tileGaussianIds.size(0); + const int64_t numDistCoeffs = distortionCoeffs.size(1); + + RasterizeFromWorldCommonArgs commonArgs{ + static_cast(C), + imageWidth, + imageHeight, + imageOriginW, + imageOriginH, + tileSize, + tileExtentW, + tileExtentH, + NUM_CHANNELS, + totalIntersections, + tileOffsets.packed_accessor64(), + tileGaussianIds.packed_accessor64(), + nullptr, + nullptr}; + + const PreparedRasterOptionalInputs opt = prepareRasterOptionalInputs( + features, C, tileExtentH, tileExtentW, (int64_t)NUM_CHANNELS, backgrounds, masks); + commonArgs.backgrounds = opt.backgrounds; + commonArgs.masks = opt.masks; + + const size_t blockSize = (size_t)tileSize * (size_t)tileSize; + const size_t sharedMem = blockSize * (sizeof(int32_t) + sizeof(SharedGaussian)); + + auto stream = at::cuda::getDefaultCUDAStream(); + rasterizeFromWorld3DGSForwardKernel<<>>( + commonArgs, + means.packed_accessor64(), + quats.packed_accessor64(), + 
logScales.packed_accessor64(), + features.packed_accessor64(), + opacities.packed_accessor64(), + worldToCamStart.packed_accessor64(), + worldToCamEnd.packed_accessor64(), + K.packed_accessor64(), + distortionCoeffs.packed_accessor64(), + numDistCoeffs, + rollingShutterType, + cameraModel, + outFeatures.data_ptr(), + outAlphas.data_ptr(), + outLastIds.data_ptr()); + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return {outFeatures, outAlphas, outLastIds}; +} + +} // namespace + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward( + const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamMatricesStart, + const torch::Tensor &worldToCamMatricesEnd, + const torch::Tensor &projectionMatrices, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const RenderSettings &settings, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const at::optional &backgrounds, + const at::optional &masks) { + FVDB_FUNC_RANGE(); + + const uint32_t imageWidth = settings.imageWidth; + const uint32_t imageHeight = settings.imageHeight; + const uint32_t imageOriginW = settings.imageOriginW; + const uint32_t imageOriginH = settings.imageOriginH; + const uint32_t tileSize = settings.tileSize; + + TORCH_CHECK_VALUE(means.is_cuda(), "means must be CUDA"); + TORCH_CHECK_VALUE(features.is_cuda(), "features must be CUDA"); + TORCH_CHECK_VALUE(tileOffsets.is_cuda(), "tileOffsets must be CUDA"); + TORCH_CHECK_VALUE(tileGaussianIds.is_cuda(), "tileGaussianIds must be CUDA"); + + TORCH_CHECK_VALUE(means.dim() == 2 && means.size(1) == 3, "means must have shape [N,3]"); + TORCH_CHECK_VALUE(quats.dim() == 2 && quats.size(1) == 4, "quats must have shape [N,4]"); + TORCH_CHECK_VALUE(logScales.dim() == 2 && logScales.size(1) == 3, + "logScales must have shape [N,3]"); + 
TORCH_CHECK_VALUE(features.dim() == 3, "features must have shape [C,N,D]"); + + const int64_t C = features.size(0); + const int64_t N = means.size(0); + TORCH_CHECK_VALUE(features.size(1) == N, "features must have shape [C,N,D] matching N"); + TORCH_CHECK_VALUE(opacities.is_cuda(), "opacities must be CUDA"); + + // Opacities may be provided either per-camera ([C,N]) or shared across cameras ([N]). + // TODO(fvdb): Avoid materializing a repeated [C,N] tensor when opacities are shared across + // cameras. We should be able to pass a view (or otherwise avoid the repeat) similar to the + // approach used in PR #451 for other rasterization paths. + torch::Tensor opacitiesBatched = opacities; + if (opacitiesBatched.dim() == 1) { + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == N, + "opacities must have shape [N] or [C,N] matching N"); + opacitiesBatched = opacitiesBatched.unsqueeze(0).repeat({C, 1}); + } + TORCH_CHECK_VALUE(opacitiesBatched.dim() == 2, "opacities must have shape [C,N]"); + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == C && opacitiesBatched.size(1) == N, + "opacities must have shape [C,N] matching features and N"); + opacitiesBatched = opacitiesBatched.contiguous(); + + TORCH_CHECK_VALUE(worldToCamMatricesStart.sizes() == torch::IntArrayRef({C, 4, 4}), + "worldToCamMatricesStart must have shape [C,4,4]"); + TORCH_CHECK_VALUE(worldToCamMatricesEnd.sizes() == torch::IntArrayRef({C, 4, 4}), + "worldToCamMatricesEnd must have shape [C,4,4]"); + TORCH_CHECK_VALUE(projectionMatrices.sizes() == torch::IntArrayRef({C, 3, 3}), + "projectionMatrices must have shape [C,3,3]"); + + const int64_t numDistCoeffs = distortionCoeffs.size(1); + TORCH_CHECK_VALUE(distortionCoeffs.dim() == 2 && distortionCoeffs.size(0) == C, + "distortionCoeffs must have shape [C,K]"); + if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9 || + cameraModel == 
CameraModel::OPENCV_THIN_PRISM_12) { + TORCH_CHECK_VALUE(numDistCoeffs == 12, + "For CameraModel::OPENCV_* distortionCoeffs must be [C,12]"); + } + + const uint32_t channels = (uint32_t)features.size(2); + +#define CALL_FWD(NCH) \ + case NCH: \ + return launchForward(means, \ + quats, \ + logScales, \ + features, \ + opacitiesBatched, \ + worldToCamMatricesStart, \ + worldToCamMatricesEnd, \ + projectionMatrices, \ + distortionCoeffs, \ + rollingShutterType, \ + cameraModel, \ + imageWidth, \ + imageHeight, \ + imageOriginW, \ + imageOriginH, \ + tileSize, \ + tileOffsets, \ + tileGaussianIds, \ + backgrounds, \ + masks); + + switch (channels) { + CALL_FWD(1) + CALL_FWD(2) + CALL_FWD(3) + CALL_FWD(4) + CALL_FWD(5) + CALL_FWD(8) + CALL_FWD(9) + CALL_FWD(16) + CALL_FWD(17) + CALL_FWD(32) + CALL_FWD(33) + CALL_FWD(64) + CALL_FWD(65) + CALL_FWD(128) + CALL_FWD(129) + CALL_FWD(192) + CALL_FWD(193) + CALL_FWD(256) + CALL_FWD(257) + CALL_FWD(512) + CALL_FWD(513) + default: + TORCH_CHECK_VALUE(false, "Unsupported channels for rasterize-from-world-3dgs: ", channels); + } + +#undef CALL_FWD +} + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward(const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const RollingShutterType, + const CameraModel, + const RenderSettings &, + const torch::Tensor &, + const torch::Tensor &, + const at::optional &, + const at::optional &) { + TORCH_CHECK_VALUE(false, "dispatchGaussianRasterizeFromWorld3DGSForward is CUDA-only"); +} + +} // namespace fvdb::detail::ops diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h new file mode 100644 index 00000000..4b00dac9 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h @@ -0,0 +1,61 @@ +// 
Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H + +#include +#include + +#include + +#include +#include + +namespace fvdb::detail::ops { + +/// @brief Rasterize images directly from 3D Gaussians using per-pixel rays. +/// +/// This kernel follows the gsplat "RasterizeToPixelsFromWorld3DGS" algorithm, but is wired to +/// FVDB's existing tile intersection representation (`tileOffsets`, `tileGaussianIds`). +/// +/// Inputs are world-space Gaussians (means/quats/logScales) and per-camera per-gaussian features +/// and opacities. The camera is defined via world->camera matrices (start/end), intrinsics, +/// `CameraModel`, rolling shutter policy, and packed OpenCV distortion coefficients. +/// +/// This is a dense-only rasterizer: outputs are dense tensors of shape +/// - renderedFeatures: [C, H, W, D] +/// - renderedAlphas: [C, H, W, 1] +/// - lastIds: [C, H, W] +/// +/// @tparam DeviceType torch::kCUDA (CPU not implemented). 
+template +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward( + // Gaussian parameters (world space) + const torch::Tensor &means, // [N, 3] + const torch::Tensor &quats, // [N, 4] (w,x,y,z) + const torch::Tensor &logScales, // [N, 3] + // Per-camera quantities + const torch::Tensor &features, // [C, N, D] + const torch::Tensor &opacities, // [C, N] + const torch::Tensor &worldToCamMatricesStart, // [C, 4, 4] + const torch::Tensor &worldToCamMatricesEnd, // [C, 4, 4] + const torch::Tensor &projectionMatrices, // [C, 3, 3] + const torch::Tensor &distortionCoeffs, // [C, K] (K=0 or 12) + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Render settings + const RenderSettings &settings, + // Intersections + const torch::Tensor &tileOffsets, // [C, tileH, tileW] + const torch::Tensor &tileGaussianIds, // [n_isects] values in [0, C*N) + // Optional background + const at::optional &backgrounds = at::nullopt, // [C, D] + // Optional tile masks (parity with classic rasterizer) + const at::optional &masks = at::nullopt // [C, tileH, tileW] bool +); + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h new file mode 100644 index 00000000..79239f53 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h @@ -0,0 +1,59 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H + +#include +#include + +#include + +namespace fvdb::detail::ops { + +struct PreparedRasterOptionalInputs { + const float *backgrounds = nullptr; + const bool *masks = nullptr; + torch::Tensor backgroundsContig; + torch::Tensor masksContig; +}; + +inline PreparedRasterOptionalInputs 
+prepareRasterOptionalInputs(const torch::Tensor &features, + const int64_t C, + const int64_t tileExtentH, + const int64_t tileExtentW, + const int64_t numChannels, + const at::optional &backgrounds, + const at::optional &masks) { + PreparedRasterOptionalInputs out; + + if (backgrounds.has_value()) { + TORCH_CHECK(backgrounds.value().is_cuda(), "backgrounds must be CUDA"); + TORCH_CHECK(backgrounds.value().device() == features.device(), + "backgrounds must be on the same device as features"); + TORCH_CHECK(backgrounds.value().scalar_type() == torch::kFloat32, + "backgrounds must have dtype=float32"); + TORCH_CHECK(backgrounds.value().sizes() == torch::IntArrayRef({C, numChannels}), + "backgrounds must have shape [C, NUM_CHANNELS]"); + out.backgroundsContig = backgrounds.value().contiguous(); + out.backgrounds = out.backgroundsContig.data_ptr(); + } + + if (masks.has_value()) { + TORCH_CHECK(masks.value().is_cuda(), "masks must be CUDA"); + TORCH_CHECK(masks.value().device() == features.device(), + "masks must be on the same device as features"); + TORCH_CHECK(masks.value().scalar_type() == torch::kBool, "masks must have dtype=bool"); + TORCH_CHECK(masks.value().sizes() == torch::IntArrayRef({C, tileExtentH, tileExtentW}), + "masks must have shape [C, tileExtentH, tileExtentW]"); + out.masksContig = masks.value().contiguous(); + out.masks = out.masksContig.data_ptr(); + } + + return out; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh b/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh new file mode 100644 index 00000000..ff8d4a48 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh @@ -0,0 +1,67 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH + 
+#include + +#include + +namespace fvdb::detail::ops { + +/// @brief Rigid transform (cached rotation + translation). +/// +/// Quaternion is stored as \([w,x,y,z]\) and is assumed to represent a rotation. +/// The corresponding rotation matrix \(R(q)\) is cached to avoid recomputing it for every point +/// transform (UT sigma points, rolling-shutter iterations, ray generation, etc.). +template struct RigidTransform { + nanovdb::math::Mat3 R; + nanovdb::math::Vec4 q; + nanovdb::math::Vec3 t; + + /// @brief Default constructor (identity transform). + /// + /// Initializes to unit quaternion \([1,0,0,0]\) and zero translation. + inline __host__ __device__ + RigidTransform() + : R(nanovdb::math::Mat3(nanovdb::math::Vec3(T(1), T(0), T(0)), + nanovdb::math::Vec3(T(0), T(1), T(0)), + nanovdb::math::Vec3(T(0), T(0), T(1)))), + q(T(1), T(0), T(0), T(0)), t(T(0), T(0), T(0)) {} + + /// @brief Construct from quaternion and translation. + /// @param[in] q_in Rotation quaternion \([w,x,y,z]\). + /// @param[in] t_in Translation vector. + inline __host__ __device__ + RigidTransform(const nanovdb::math::Vec4 &q_in, const nanovdb::math::Vec3 &t_in) + : R(quaternionToRotationMatrix(q_in)), q(q_in), t(t_in) {} + + /// @brief Construct from rotation matrix and translation. + /// @param[in] R_in Rotation matrix. + /// @param[in] t_in Translation vector. + inline __host__ __device__ + RigidTransform(const nanovdb::math::Mat3 &R_in, const nanovdb::math::Vec3 &t_in) + : R(R_in), q(rotationMatrixToQuaternion(R_in)), t(t_in) {} + + /// @brief Apply the transform to a 3D point: \(R(q)\,p + t\). + inline __host__ __device__ nanovdb::math::Vec3 + apply(const nanovdb::math::Vec3 &p_world) const { + // p_cam = R * p_world + t + return R * p_world + t; + } + + /// @brief Interpolate between two rigid transforms. + /// + /// Translation is linearly interpolated; rotation uses NLERP along the shortest arc. 
+ inline static __host__ __device__ RigidTransform + interpolate(const T u, const RigidTransform &start, const RigidTransform &end) { + const nanovdb::math::Vec3 t_interp = start.t + u * (end.t - start.t); + const nanovdb::math::Vec4 q_interp = nlerpQuaternionShortestPath(start.q, end.q, u); + return RigidTransform(q_interp, t_interp); + } +}; + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh b/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh new file mode 100644 index 00000000..be69d9d8 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh @@ -0,0 +1,53 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __host__ __device__ T +clamp01(const T x) { + return (x < T(0)) ? T(0) : ((x > T(1)) ? T(1) : x); +} + +/// @brief Convert a pixel coordinate to rolling-shutter time in \([0,1]\). +/// +/// FVDB convention: +/// - vertical: time varies with image row (y) +/// - horizontal: time varies with image column (x) +/// - none: time is 0 +/// +/// Pixel coordinates are in image space (not normalized), and we use `floor` (matching existing +/// kernels) before normalizing by (dim-1). +template +inline __host__ __device__ T +rollingShutterTimeFromPixel(const RollingShutterType rollingShutterType, + const T px, + const T py, + const int64_t imageWidth, + const int64_t imageHeight) { + if (rollingShutterType == RollingShutterType::NONE) { + return T(0); + } + + T u = T(0); + T denom = T(1); + if (rollingShutterType == RollingShutterType::VERTICAL) { + denom = (imageHeight > 1) ? 
T(imageHeight - 1) : T(1); + u = ::cuda::std::floor(py) / denom; + } else if (rollingShutterType == RollingShutterType::HORIZONTAL) { + denom = (imageWidth > 1) ? T(imageWidth - 1) : T(1); + u = ::cuda::std::floor(px) / denom; + } + return clamp01(u); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH diff --git a/src/python/GaussianSplatBinding.cpp b/src/python/GaussianSplatBinding.cpp index 88255495..1154b650 100644 --- a/src/python/GaussianSplatBinding.cpp +++ b/src/python/GaussianSplatBinding.cpp @@ -8,11 +8,28 @@ #include #include #include +#include #include void bind_gaussian_splat3d(py::module &m) { + py::enum_(m, "RollingShutterType") + .value("NONE", fvdb::detail::ops::RollingShutterType::NONE) + .value("VERTICAL", fvdb::detail::ops::RollingShutterType::VERTICAL) + .value("HORIZONTAL", fvdb::detail::ops::RollingShutterType::HORIZONTAL) + .export_values(); + + py::enum_(m, "CameraModel") + .value("PINHOLE", fvdb::detail::ops::CameraModel::PINHOLE) + .value("OPENCV_RADTAN_5", fvdb::detail::ops::CameraModel::OPENCV_RADTAN_5) + .value("OPENCV_RATIONAL_8", fvdb::detail::ops::CameraModel::OPENCV_RATIONAL_8) + .value("OPENCV_RADTAN_THIN_PRISM_9", + fvdb::detail::ops::CameraModel::OPENCV_RADTAN_THIN_PRISM_9) + .value("OPENCV_THIN_PRISM_12", fvdb::detail::ops::CameraModel::OPENCV_THIN_PRISM_12) + .value("ORTHOGRAPHIC", fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) + .export_values(); + py::class_(m, "ProjectedGaussianSplats") .def_property_readonly("means2d", &fvdb::GaussianSplat3d::ProjectedGaussianSplats::means2d) .def_property_readonly("conics", &fvdb::GaussianSplat3d::ProjectedGaussianSplats::conics) @@ -201,6 +218,24 @@ bind_gaussian_splat3d(py::module &m) { py::arg("antialias") = false, py::arg("backgrounds") = std::nullopt) + .def("render_images_from_world", + &fvdb::GaussianSplat3d::renderImagesFromWorld, + py::arg("world_to_camera_matrices"), + py::arg("projection_matrices"), + py::arg("image_width"), + 
py::arg("image_height"), + py::arg("near"), + py::arg("far"), + py::arg("camera_model") = fvdb::detail::ops::CameraModel::PINHOLE, + py::arg("distortion_coeffs") = std::nullopt, + py::arg("sh_degree_to_use") = -1, + py::arg("tile_size") = 16, + py::arg("min_radius_2d") = 0.0, + py::arg("eps_2d") = 0.3, + py::arg("antialias") = false, + py::arg("backgrounds") = std::nullopt, + py::arg("masks") = std::nullopt) + .def("render_depths", &fvdb::GaussianSplat3d::renderDepths, py::arg("world_to_camera_matrices"), diff --git a/src/tests/GaussianRasterizeForwardTest.cpp b/src/tests/GaussianRasterizeForwardTest.cpp index c64d55cb..a234651f 100644 --- a/src/tests/GaussianRasterizeForwardTest.cpp +++ b/src/tests/GaussianRasterizeForwardTest.cpp @@ -8,11 +8,19 @@ #include #include +#include +#include #include #include #include +#if defined(__linux__) +#include +#include +#include +#endif + #include #include #include @@ -21,6 +29,10 @@ #error "FVDB_EXTERNAL_TEST_DATA_PATH must be defined" #endif +namespace { +constexpr const char *kMaskedEdgeTileChildEnv = "FVDB_GSPLAT_MASKED_EDGE_TILE_CHILD"; +} // namespace + struct GaussianRasterizeForwardTestFixture : public ::testing::Test { void loadInputData(const std::string insPath) { @@ -290,6 +302,127 @@ struct GaussianRasterizeForwardTestFixture : public ::testing::Test { uint32_t tileSize; }; +TEST(GaussianRasterizeForwardMaskedEdgeTile, Child) { +#if !defined(__linux__) + GTEST_SKIP() << "This regression test is Linux-only."; +#else + const char *isChild = std::getenv(kMaskedEdgeTileChildEnv); + if (!(isChild && std::string(isChild) == "1")) { + GTEST_SKIP() << "Not running child path."; + } + + if (c10::cuda::device_count() <= 0) { + GTEST_SKIP() << "CUDA not available."; + } + + const at::cuda::CUDAGuard device_guard(0); + + constexpr int64_t C = 1; + + constexpr uint32_t imageWidth = 17; + constexpr uint32_t imageHeight = 17; + constexpr uint32_t tileSize = 16; + constexpr uint32_t tileExtentH = (imageHeight + tileSize - 1) / 
tileSize; // 2 + constexpr uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; // 2 + + auto fopts = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kFloat32); + + // Single Gaussian that intersects the bottom-right edge tile. + const auto means2d = torch::tensor({{{16.5f, 16.5f}}}, fopts); // [C,N,2] + const auto conics = torch::tensor({{{1.0f, 0.0f, 1.0f}}}, fopts); // [C,N,3] + const auto features = torch::tensor({{{0.4f, 0.5f, -0.6f}}}, fopts); // [C,N,D] + const auto opacities = torch::tensor({{0.9f}}, fopts); // [C,N] + const auto radii = torch::tensor( + {{1}}, torch::TensorOptions().device(torch::kCUDA).dtype(torch::kInt32)); // [C,N] + const auto depths = torch::tensor({{1.0f}}, fopts); // [C,N] + const auto backgrounds = torch::tensor({{0.1f, -0.2f, 0.3f}}, fopts); // [C,D] + + auto masks = torch::ones({C, (int64_t)tileExtentH, (int64_t)tileExtentW}, + torch::TensorOptions().device(torch::kCUDA).dtype(torch::kBool)); + masks[0][1][1] = false; // mask out bottom-right edge tile + + auto [tileOffsets, tileGaussianIds] = + fvdb::detail::ops::dispatchGaussianTileIntersection( + means2d, radii, depths, at::nullopt, (uint32_t)C, tileSize, tileExtentH, tileExtentW); + + auto [outFeatures, outAlphas, outLastIds] = + fvdb::detail::ops::dispatchGaussianRasterizeForward(means2d, + conics, + features, + opacities, + imageWidth, + imageHeight, + 0, + 0, + tileSize, + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + + (void)outLastIds; + + // Ensure the kernel completed (this would hang if there is a deadlock). + C10_CUDA_CHECK(cudaDeviceSynchronize()); + + // The only in-bounds pixel in the bottom-right edge tile is (16,16). It should be filled + // with background and alpha=0. 
+ const auto outFeaturesCpu = outFeatures.cpu(); + const auto outAlphasCpu = outAlphas.cpu(); + + EXPECT_EQ(outAlphasCpu[0][16][16][0].item(), 0.0f); + EXPECT_EQ(outFeaturesCpu[0][16][16][0].item(), 0.1f); + EXPECT_EQ(outFeaturesCpu[0][16][16][1].item(), -0.2f); + EXPECT_EQ(outFeaturesCpu[0][16][16][2].item(), 0.3f); +#endif +} + +TEST(GaussianRasterizeForwardMaskedEdgeTile, NoDeadlock) { +#if !defined(__linux__) + GTEST_SKIP() << "This regression test is Linux-only."; +#else + const char *isChild = std::getenv(kMaskedEdgeTileChildEnv); + if (isChild && std::string(isChild) == "1") { + GTEST_SKIP() << "Running child path."; + } + + pid_t pid = fork(); + ASSERT_GE(pid, 0) << "fork() failed"; + + if (pid == 0) { + // Child: exec the same test binary but run only the child test. + setenv(kMaskedEdgeTileChildEnv, "1", 1); + execl("/proc/self/exe", + "/proc/self/exe", + "--gtest_filter=GaussianRasterizeForwardMaskedEdgeTile.Child", + "--gtest_color=no", + (char *)nullptr); + _exit(127); + } + + int status = 0; + bool exited{false}; + constexpr int kTimeoutMs = 20000; + for (int elapsed = 0; elapsed < kTimeoutMs; elapsed += 50) { + const pid_t r = waitpid(pid, &status, WNOHANG); + if (r == pid) { + exited = true; + break; + } + usleep(50 * 1000); + } + + if (!exited) { + kill(pid, SIGKILL); + (void)waitpid(pid, &status, 0); + FAIL() << "Deadlock detected: child process timed out."; + } + + ASSERT_TRUE(WIFEXITED(status)) << "Child did not exit cleanly."; + ASSERT_EQ(WEXITSTATUS(status), 0) << "Child test failed."; +#endif +} + // This is a helper function to generate the output data for the test cases. // Only enable this test when you want to update the output data. 
TEST_F(GaussianRasterizeForwardTestFixture, DISABLED_GenerateOutputData) { diff --git a/tests/unit/test_rasterize_from_world.py b/tests/unit/test_rasterize_from_world.py new file mode 100644 index 00000000..70fb2c2e --- /dev/null +++ b/tests/unit/test_rasterize_from_world.py @@ -0,0 +1,752 @@ +#!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + + +def _build_gaussian_splat( + fvdb, + *, + means: torch.Tensor, + quats: torch.Tensor, + log_scales: torch.Tensor, + logit_opacities: torch.Tensor, + sh0: torch.Tensor, + shN: torch.Tensor, +): + return fvdb.GaussianSplat3d.from_tensors( + means=means, + quats=quats, + log_scales=log_scales, + logit_opacities=logit_opacities, + sh0=sh0, + shN=shN, + accumulate_mean_2d_gradients=False, + accumulate_max_2d_radii=False, + detach=False, + ) + + +def _default_camera( + device: torch.device, + dtype: torch.dtype, + *, + cx: float, + cy: float, +): + world_to_cam = torch.eye(4, device=device, dtype=dtype).unsqueeze(0) + K = torch.tensor( + [[[18.0, 0.0, cx], [0.0, 18.0, cy], [0.0, 0.0, 1.0]]], + device=device, + dtype=dtype, + ) + return world_to_cam, K + + +def _render_from_world( + fvdb, + gs, + *, + world_to_cam: torch.Tensor, + K: torch.Tensor, + image_width: int, + image_height: int, + sh_degree_to_use: int, + tile_size: int = 16, + backgrounds: torch.Tensor | None = None, + masks: torch.Tensor | None = None, +): + return gs.render_images_from_world( + world_to_camera_matrices=world_to_cam, + projection_matrices=K, + image_width=image_width, + image_height=image_height, + near=0.01, + far=1e10, + camera_model=fvdb.CameraModel.PINHOLE, + distortion_coeffs=None, + sh_degree_to_use=sh_degree_to_use, + tile_size=tile_size, + min_radius_2d=0.0, + eps_2d=0.3, + antialias=False, + backgrounds=backgrounds, + masks=masks, + ) + + +def _assert_nonzero_finite_grad(tensor: torch.Tensor) -> None: + assert tensor.grad is not None + assert 
torch.isfinite(tensor.grad).all() + assert tensor.grad.abs().sum().item() > 0.0 + + +def _render_images_from_world_masked_edge_tile_worker(queue) -> None: + """ + Subprocess worker for the masked edge-tile deadlock regression test. + + Must be top-level for multiprocessing "spawn" pickling. + """ + try: + import fvdb + + device = torch.device("cuda") + dtype = torch.float32 + + C, N, D = 1, 1, 3 + image_width = 17 + image_height = 17 + tile_size = 16 # -> tileH=tileW=2, with a 1-pixel edge tile at (1,1) + + # Place the Gaussian so it projects into the bottom-right edge tile. + means = torch.tensor([[1.11, 1.11, 2.5]], device=device, dtype=dtype, requires_grad=True) + quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True) + log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True) + logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True) + sh0 = torch.tensor([[[0.8, -0.2, 0.3]]], device=device, dtype=dtype, requires_grad=True) + shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True) + + gs = _build_gaussian_splat( + fvdb, + means=means, + quats=quats, + log_scales=log_scales, + logit_opacities=logit_opacities, + sh0=sh0, + shN=shN, + ) + + world_to_cam, K = _default_camera(device, dtype, cx=8.0, cy=8.0) + + backgrounds = torch.tensor([[0.10, -0.20, 0.30]], device=device, dtype=dtype) # [C,D] + masks = torch.ones((C, 2, 2), device=device, dtype=torch.bool) + masks[0, 1, 1] = False # mask out the bottom-right edge tile + + rendered, alphas = _render_from_world( + fvdb, + gs, + world_to_cam=world_to_cam, + K=K, + image_width=image_width, + image_height=image_height, + sh_degree_to_use=0, + tile_size=tile_size, + backgrounds=backgrounds, + masks=masks, + ) + + loss = rendered.sum() + alphas.sum() + loss.backward() + torch.cuda.synchronize() + + queue.put(("ok", None)) + except Exception: # pragma: no cover + import traceback + + 
        # Tail of _render_images_from_world_masked_edge_tile_worker (def starts above this
        # chunk): report the failure back to the parent process through the queue.
        queue.put(("err", traceback.format_exc()))


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_nonzero():
    """
    Smoke-check the dense rasterizer backward pass: rendering a single Gaussian and
    backpropagating a simple loss must produce finite, nonzero gradients for every
    differentiable parameter (means, quats, log_scales, logit_opacities, sh0).
    """
    # Import inside the test so CPU-only environments can still collect this file.
    import fvdb

    device = torch.device("cuda")
    C, N, D = 1, 1, 3

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=torch.float32, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=torch.float32,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=torch.float32, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=torch.float32, requires_grad=True)

    # Degree-0 SH only: sh0 is [N,1,D]; shN is empty ([N,0,D]).
    sh0 = torch.randn((N, 1, D), device=device, dtype=torch.float32, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=torch.float32, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    image_width = 16
    image_height = 16
    world_to_cam, K = _default_camera(device, torch.float32, cx=7.5, cy=7.5)

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
    )

    # Scalar objective touching both the rendered features and the alpha channel so all
    # parameters receive gradient signal.
    loss = (rendered * rendered).sum() + alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_match_finite_differences():
    """
    Finite-difference check for the 3DGS dense rasterizer backward pass.

    This is intentionally small (C=N=1, single tile) to keep runtime down and to avoid
    discontinuities from tile assignment changes.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    # No higher-order SH in this test.
    shN = torch.empty((N, 0, D), device=device, dtype=dtype)

    def loss_from_params(
        means_t: torch.Tensor,
        quats_t: torch.Tensor,
        log_scales_t: torch.Tensor,
        logit_opacities_t: torch.Tensor,
        sh0_t: torch.Tensor,
    ) -> torch.Tensor:
        # Rebuild the splat from the given parameters and return a smooth scalar loss;
        # used both for the autograd baseline and for finite-difference evaluations.
        gs = _build_gaussian_splat(
            fvdb,
            means=means_t,
            quats=quats_t,
            log_scales=log_scales_t,
            logit_opacities=logit_opacities_t,
            sh0=sh0_t,
            shN=shN,
        )

        rendered, alphas = _render_from_world(
            fvdb,
            gs,
            world_to_cam=world_to_cam,
            K=K,
            image_width=image_width,
            image_height=image_height,
            sh_degree_to_use=0,
            tile_size=16,  # single tile
        )

        # Smooth scalar objective; includes both color/features and opacity terms.
        return (rendered * rendered).sum() + 0.5 * alphas.sum()

    # Baseline parameters.
    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.tensor([[[0.8, -0.2, 0.3]]], device=device, dtype=dtype, requires_grad=True)  # [N,1,D]

    # Autograd gradients.
    loss = loss_from_params(means, quats, log_scales, logit_opacities, sh0)
    loss.backward()

    assert means.grad is not None
    assert quats.grad is not None
    assert log_scales.grad is not None
    assert logit_opacities.grad is not None
    assert sh0.grad is not None

    # Centered finite differences for a small subset of scalars.
    eps = 1.0e-3
    rtol = 2.5e-2
    atol = 2.5e-2

    def fd_scalar(param_name: str, i0: int, i1: int | None = None, i2: int | None = None) -> float:
        # Centered finite difference of the loss w.r.t. one scalar entry of one parameter
        # tensor, selected by `param_name` and indices (i0[, i1[, i2]]).
        # NOTE(review): reconstructed indentation places the loss evaluations inside the
        # torch.no_grad() block — only .item() values are used, so grad mode is irrelevant
        # to the result; confirm against the original layout.
        with torch.no_grad():
            means_p = means.detach().clone()
            means_m = means.detach().clone()
            quats_p = quats.detach().clone()
            quats_m = quats.detach().clone()
            log_scales_p = log_scales.detach().clone()
            log_scales_m = log_scales.detach().clone()
            logit_p = logit_opacities.detach().clone()
            logit_m = logit_opacities.detach().clone()
            sh0_p = sh0.detach().clone()
            sh0_m = sh0.detach().clone()

            if param_name == "means":
                assert i1 is not None
                means_p[i0, i1] += eps
                means_m[i0, i1] -= eps
            elif param_name == "quats":
                assert i1 is not None
                quats_p[i0, i1] += eps
                quats_m[i0, i1] -= eps
            elif param_name == "log_scales":
                assert i1 is not None
                log_scales_p[i0, i1] += eps
                log_scales_m[i0, i1] -= eps
            elif param_name == "logit_opacities":
                assert i1 is None
                logit_p[i0] += eps
                logit_m[i0] -= eps
            elif param_name == "sh0":
                assert i1 is not None and i2 is not None
                sh0_p[i0, i1, i2] += eps
                sh0_m[i0, i1, i2] -= eps
            else:
                raise AssertionError(f"Unknown param_name: {param_name}")

            lp = loss_from_params(means_p, quats_p, log_scales_p, logit_p, sh0_p).item()
            lm = loss_from_params(means_m, quats_m, log_scales_m, logit_m, sh0_m).item()
            return (lp - lm) / (2.0 * eps)

    # (name, autograd gradient, finite-difference gradient) triples to compare.
    checks: list[tuple[str, float, float]] = []
    checks.append(("means[0,0]", float(means.grad[0, 0].item()), fd_scalar("means", 0, 0)))
    checks.append(("means[0,2]", float(means.grad[0, 2].item()), fd_scalar("means", 0, 2)))
    checks.append(("log_scales[0,0]", float(log_scales.grad[0, 0].item()), fd_scalar("log_scales", 0, 0)))
    checks.append(("log_scales[0,2]", float(log_scales.grad[0, 2].item()), fd_scalar("log_scales", 0, 2)))
    checks.append(("logit_opacities[0]", float(logit_opacities.grad[0].item()), fd_scalar("logit_opacities", 0)))
    checks.append(("sh0[0,0,0]", float(sh0.grad[0, 0, 0].item()), fd_scalar("sh0", 0, 0, 0)))
    checks.append(("sh0[0,0,2]", float(sh0.grad[0, 0, 2].item()), fd_scalar("sh0", 0, 0, 2)))

    for name, grad_autograd, grad_fd in checks:
        assert torch.isfinite(torch.tensor(grad_autograd))
        assert torch.isfinite(torch.tensor(grad_fd))
        assert grad_autograd == pytest.approx(
            grad_fd, rel=rtol, abs=atol
        ), f"{name}: autograd={grad_autograd} fd={grad_fd}"

    # Quaternion tangent-space finite-difference check.
    # We compare the FD directional derivative along a small axis-angle perturbation to the
    # autograd gradient dotted with dq/dt (estimated numerically).
    def quat_mul_wxyz(q1: torch.Tensor, q2: torch.Tensor) -> torch.Tensor:
        # Hamilton product of two quaternions in (w, x, y, z) layout.
        w1, x1, y1, z1 = q1.unbind(-1)
        w2, x2, y2, z2 = q2.unbind(-1)
        return torch.stack(
            (
                w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
                w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
                w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
                w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
            ),
            dim=-1,
        )

    def quat_exp_axis_angle_wxyz(v: torch.Tensor) -> torch.Tensor:
        # Exponential map: axis-angle vector v -> unit quaternion (w, x, y, z).
        theta = torch.linalg.norm(v)
        half = 0.5 * theta
        if float(theta.item()) < 1.0e-12:
            # Identity rotation for (near-)zero perturbations.
            return torch.tensor([1.0, 0.0, 0.0, 0.0], device=v.device, dtype=v.dtype)
        axis = v / theta
        return torch.cat((torch.cos(half).view(1), torch.sin(half) * axis), dim=0)

    with torch.no_grad():
        q0 = quats.detach()[0]  # [4] wxyz
        # Two independent tangent directions.
        dirs = [
            torch.tensor([1.0, 0.0, 0.0], device=device, dtype=dtype),
            torch.tensor([0.0, 1.0, 0.2], device=device, dtype=dtype),
        ]
        for j, d in enumerate(dirs):
            d = d / torch.linalg.norm(d)
            dq_plus = quat_exp_axis_angle_wxyz(eps * d)
            dq_minus = quat_exp_axis_angle_wxyz(-eps * d)
            q_plus = quat_mul_wxyz(dq_plus, q0).view(1, 4)
            q_minus = quat_mul_wxyz(dq_minus, q0).view(1, 4)

            lp = loss_from_params(
                means.detach(), q_plus, log_scales.detach(), logit_opacities.detach(), sh0.detach()
            ).item()
            lm = loss_from_params(
                means.detach(), q_minus, log_scales.detach(), logit_opacities.detach(), sh0.detach()
            ).item()
            fd_dir = (lp - lm) / (2.0 * eps)

            # Numerical dq/dt along this tangent direction; dotting the autograd gradient
            # with it gives the autograd directional derivative to compare against fd_dir.
            dqdt = (q_plus - q_minus) / (2.0 * eps)  # [1,4]
            dir_autograd = float((quats.grad.detach() * dqdt).sum().item())

            assert torch.isfinite(torch.tensor(fd_dir))
            assert torch.isfinite(torch.tensor(dir_autograd))
            assert dir_autograd == pytest.approx(
                fd_dir, rel=rtol, abs=atol
            ), f"quats tangent dir {j}: autograd={dir_autograd} fd={fd_dir}"


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_nonzero_with_shN():
    """
    Same nonzero-gradient smoke check as above, but rendering with SH degree 1 so that the
    higher-order coefficients (shN) also participate and must receive gradients.
    """
    import fvdb

    device = torch.device("cuda")
    C, N, D = 1, 1, 3

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=torch.float32, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=torch.float32,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=torch.float32, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=torch.float32, requires_grad=True)

    # Degree 1 uses 4 SH bases total; sh0 is [N,1,D] and shN holds the remaining K-1=3 bases.
    sh0 = torch.randn((N, 1, D), device=device, dtype=torch.float32, requires_grad=True)
    shN = torch.randn((N, 3, D), device=device, dtype=torch.float32, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    image_width = 16
    image_height = 16
    world_to_cam, K = _default_camera(device, torch.float32, cx=7.5, cy=7.5)

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=1,
    )

    loss = (rendered * rendered).sum() + alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)
    _assert_nonzero_finite_grad(shN)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_shN_grads_match_finite_differences():
    """
    Finite-difference check for SHN coefficients (non-constant SH terms).
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    N = 1
    D = 3
    image_width = 16
    image_height = 16

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    # All non-shN parameters are fixed (no requires_grad); only shN is differentiated here.
    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype)

    sh0 = torch.tensor([0.8, -0.2, 0.3], device=device, dtype=dtype).view(N, 1, D).contiguous()
    # Baseline shN values; perturbed copies are derived from this tensor below.
    shN0 = (
        torch.tensor(
            [[0.05, -0.01, 0.02], [0.00, 0.03, -0.04], [0.01, 0.02, 0.00]],
            device=device,
            dtype=dtype,
        )
        .view(N, 3, D)
        .contiguous()
    )

    def loss_from_shN(shN_t: torch.Tensor) -> torch.Tensor:
        # Smooth scalar loss as a function of the shN coefficients only.
        gs = _build_gaussian_splat(
            fvdb,
            means=means,
            quats=quats,
            log_scales=log_scales,
            logit_opacities=logit_opacities,
            sh0=sh0,
            shN=shN_t,
        )

        rendered, alphas = _render_from_world(
            fvdb,
            gs,
            world_to_cam=world_to_cam,
            K=K,
            image_width=image_width,
            image_height=image_height,
            sh_degree_to_use=1,
        )

        return (rendered * rendered).sum() + 0.5 * alphas.sum()

    shN = shN0.detach().clone().requires_grad_(True)
    loss = loss_from_shN(shN)
    loss.backward()
    assert shN.grad is not None

    eps = 1.0e-3
    rtol = 2.5e-2
    atol = 2.5e-2

    def fd_shN(i1: int, i2: int) -> float:
        # Centered finite difference of the loss w.r.t. shN[0, i1, i2].
        with torch.no_grad():
            shN_p = shN0.detach().clone()
            shN_m = shN0.detach().clone()
            shN_p[0, i1, i2] += eps
            shN_m[0, i1, i2] -= eps
            lp = loss_from_shN(shN_p).item()
            lm = loss_from_shN(shN_m).item()
            return (lp - lm) / (2.0 * eps)

    checks = [
        ("shN[0,0,0]", float(shN.grad[0, 0, 0].item()), fd_shN(0, 0)),
        ("shN[0,2,1]", float(shN.grad[0, 2, 1].item()), fd_shN(2, 1)),
    ]

    for name, grad_autograd, grad_fd in checks:
        assert torch.isfinite(torch.tensor(grad_autograd))
        assert torch.isfinite(torch.tensor(grad_fd))
        assert grad_autograd == pytest.approx(
            grad_fd, rel=rtol, abs=atol
        ), f"{name}: autograd={grad_autograd} fd={grad_fd}"


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_backward_with_backgrounds_and_masks():
    """
    Regression test: exercise autograd backward when *both* backgrounds and masks are provided.

    Historically, mismatches between the number of forward inputs and backward returns can cause
    runtime errors in some PyTorch builds. This test ensures the code path runs end-to-end.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16
    tile_size = 16  # single tile -> masks is [C,1,1]

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=dtype,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.1, -0.2, 0.3]], device=device, dtype=dtype)  # [C,D]
    masks = torch.ones((C, 1, 1), device=device, dtype=torch.bool)  # enable the only tile

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        tile_size=tile_size,
        backgrounds=backgrounds,
        masks=masks,
    )

    loss = (rendered * rendered).sum() + 0.5 * alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_masks_write_background_and_zero_grads():
    """
    With the only tile masked out, the rendered image must be exactly the background,
    alphas must be exactly zero, and backward must produce exactly-zero gradients for
    every Gaussian parameter (masked pixels contribute nothing).
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16
    tile_size = 16  # single tile -> mask is [C,1,1]

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.10, -0.20, 0.30]], device=device, dtype=dtype)  # [C,D]
    masks = torch.zeros((C, 1, 1), device=device, dtype=torch.bool)  # mask out the only tile

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        tile_size=tile_size,
        backgrounds=backgrounds,
        masks=masks,
    )

    # Exact (bitwise) comparisons are intentional: masked tiles must be written with the
    # background verbatim, not a blended approximation.
    expected = backgrounds.view(C, 1, 1, D).expand(C, image_height, image_width, D)
    assert torch.equal(alphas, torch.zeros_like(alphas))
    assert torch.equal(rendered, expected)

    loss = rendered.sum() + alphas.sum()
    loss.backward()

    # Masked pixels contribute nothing: grads should be exactly zero.
    assert means.grad is not None and torch.equal(means.grad, torch.zeros_like(means.grad))
    assert quats.grad is not None and torch.equal(quats.grad, torch.zeros_like(quats.grad))
    assert log_scales.grad is not None and torch.equal(log_scales.grad, torch.zeros_like(log_scales.grad))
    assert logit_opacities.grad is not None and torch.equal(
        logit_opacities.grad, torch.zeros_like(logit_opacities.grad)
    )


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_masks_edge_tile_does_not_deadlock():
    """
    Regression test: masked *edge tiles* must not deadlock.

    Edge tiles (where image dimensions are not multiples of tile size) have threads that are
    `!inside`, but the kernel still uses block-level barriers. Any early-return taken by only
    the `inside` threads (e.g. for masked tiles) can deadlock the whole kernel launch.

    We run the render+backward in a subprocess with a timeout; a deadlock will cause the test
    to hang and fail deterministically.
    """

    import multiprocessing as mp

    # "spawn" gives the child a fresh CUDA context (fork is unsafe with CUDA).
    ctx = mp.get_context("spawn")
    q = ctx.Queue()

    p = ctx.Process(target=_render_images_from_world_masked_edge_tile_worker, args=(q,))
    p.start()
    p.join(timeout=20.0)

    if p.is_alive():
        # The worker hung: kill it and fail loudly rather than hanging the test session.
        p.terminate()
        p.join(timeout=5.0)
        pytest.fail("CUDA kernel deadlocked on masked edge tile (process timeout).")

    # The worker reports ("ok", ...) on success or ("err", traceback) on failure.
    status, payload = q.get(timeout=5.0)
    if status != "ok":
        raise AssertionError(payload)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_backgrounds_used_when_no_intersections():
    """
    If no Gaussians intersect the image, rasterization should return the background (if provided)
    with alpha=0 everywhere.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16

    # Put the Gaussian in front of the camera but closer than near plane so it should be clipped.
    means = torch.tensor([[0.0, 0.0, 1.0e-4]], device=device, dtype=dtype)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.25, 0.50, -0.75]], device=device, dtype=dtype)
    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        backgrounds=backgrounds,
        masks=None,
    )

    # With zero intersections the output must be exactly the background and alpha must be 0.
    expected = backgrounds.view(C, 1, 1, D).expand(C, image_height, image_width, D)
    assert torch.equal(alphas, torch.zeros_like(alphas))
    assert torch.equal(rendered, expected)