diff --git a/fvdb/__init__.py b/fvdb/__init__.py index 7c0c20ae..e254a8ce 100644 --- a/fvdb/__init__.py +++ b/fvdb/__init__.py @@ -126,7 +126,7 @@ def gaussian_render_jagged( from .convolution_plan import ConvolutionPlan from .gaussian_splatting import GaussianSplat3d, ProjectedGaussianSplats -from .enums import ProjectionType, ShOrderingMode +from .enums import CameraModel, ProjectionType, RollingShutterType, ShOrderingMode # Import torch-compatible functions that work with both Tensor and JaggedTensor from .torch_jagged import ( @@ -190,6 +190,8 @@ def gaussian_render_jagged( "JaggedTensor", "GaussianSplat3d", "ProjectedGaussianSplats", + "CameraModel", + "RollingShutterType", "ProjectionType", "ShOrderingMode", "ConvolutionPlan", diff --git a/fvdb/__init__.pyi b/fvdb/__init__.pyi index 9dbb4f43..a63af0b5 100644 --- a/fvdb/__init__.pyi +++ b/fvdb/__init__.pyi @@ -19,7 +19,7 @@ def _parse_device_string(device_string: str | torch.device) -> torch.device: ... from . import nn, utils, viz from ._fvdb_cpp import config, hilbert, morton, volume_render from .convolution_plan import ConvolutionPlan -from .enums import ProjectionType, ShOrderingMode +from .enums import CameraModel, ProjectionType, RollingShutterType, ShOrderingMode from .gaussian_splatting import GaussianSplat3d, ProjectedGaussianSplats from .grid import Grid from .grid_batch import GridBatch, gcat @@ -109,6 +109,8 @@ __all__ = [ "GaussianSplat3d", "ProjectedGaussianSplats", "ConvolutionPlan", + "CameraModel", + "RollingShutterType", "ProjectionType", "ShOrderingMode", "Grid", diff --git a/fvdb/_fvdb_cpp.pyi b/fvdb/_fvdb_cpp.pyi index 5b7c9546..8180736a 100644 --- a/fvdb/_fvdb_cpp.pyi +++ b/fvdb/_fvdb_cpp.pyi @@ -196,6 +196,24 @@ class GaussianSplat3d: antialias: bool = ..., backgrounds: Optional[torch.Tensor] = ..., ) -> tuple[torch.Tensor, torch.Tensor]: ... 
+ def render_images_from_world( + self, + world_to_camera_matrices: torch.Tensor, + projection_matrices: torch.Tensor, + image_width: int, + image_height: int, + near: float, + far: float, + camera_model: "CameraModel" = ..., + distortion_coeffs: Optional[torch.Tensor] = ..., + sh_degree_to_use: int = ..., + tile_size: int = ..., + min_radius_2d: float = ..., + eps_2d: float = ..., + antialias: bool = ..., + backgrounds: Optional[torch.Tensor] = ..., + masks: Optional[torch.Tensor] = ..., + ) -> tuple[torch.Tensor, torch.Tensor]: ... def sparse_render_images( self, pixels_to_render: JaggedTensor, @@ -1178,3 +1196,16 @@ def volume_render( packInfo: torch.Tensor, transmittanceThresh: float, ) -> list[torch.Tensor]: ... + +class RollingShutterType(Enum): + NONE = ... + VERTICAL = ... + HORIZONTAL = ... + +class CameraModel(Enum): + PINHOLE = ... + OPENCV_RADTAN_5 = ... + OPENCV_RATIONAL_8 = ... + OPENCV_RADTAN_THIN_PRISM_9 = ... + OPENCV_THIN_PRISM_12 = ... + ORTHOGRAPHIC = ... diff --git a/fvdb/enums.py b/fvdb/enums.py index c81c3658..9d7b4cce 100644 --- a/fvdb/enums.py +++ b/fvdb/enums.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from enum import Enum +from enum import Enum, IntEnum class ProjectionType(str, Enum): @@ -48,3 +48,73 @@ class ShOrderingMode(str, Enum): The feature channels of spherical harmonics are stored in separate blocks for each coefficient. *i.e.* The spherical harmonics tensor corresponds to a (row-major) contiguous tensor of shape ``[num_coefficients, channels, num_sh_bases]``, where channels=3 for RGB. """ + + +class RollingShutterType(IntEnum): + """ + Rolling shutter policy for camera projection / ray generation. + + Rolling shutter models treat different image rows/columns as having different exposure times. + FVDB uses this to interpolate between per-camera start/end poses when generating rays. + """ + + NONE = 0 + """ + No rolling shutter: the start pose is used for all pixels. 
+ """ + + VERTICAL = 1 + """ + Vertical rolling shutter: exposure time varies with image row (y). + """ + + HORIZONTAL = 2 + """ + Horizontal rolling shutter: exposure time varies with image column (x). + """ + + +class CameraModel(IntEnum): + """ + Camera model for projection / ray generation. + + Notes: + - ``PINHOLE`` and ``ORTHOGRAPHIC`` ignore distortion coefficients. + - ``OPENCV_*`` variants use pinhole intrinsics plus OpenCV-style distortion. When distortion + coefficients are provided, FVDB expects a packed layout: + + ``[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]`` + + Unused coefficients for a given model should be set to 0. + """ + + PINHOLE = 0 + """ + Ideal pinhole camera model (no distortion). + """ + + OPENCV_RADTAN_5 = 1 + """ + OpenCV radial-tangential distortion with 5 parameters (k1,k2,p1,p2,k3). + """ + + OPENCV_RATIONAL_8 = 2 + """ + OpenCV rational radial-tangential distortion with 8 parameters (k1..k6,p1,p2). + """ + + OPENCV_RADTAN_THIN_PRISM_9 = 3 + """ + OpenCV radial-tangential + thin-prism distortion with 9 parameters (k1,k2,p1,p2,k3,s1..s4). + """ + + OPENCV_THIN_PRISM_12 = 4 + """ + OpenCV rational radial-tangential + thin-prism distortion with 12 parameters + (k1..k6,p1,p2,s1..s4). + """ + + ORTHOGRAPHIC = 5 + """ + Orthographic camera model (no distortion). + """ diff --git a/fvdb/gaussian_splatting.py b/fvdb/gaussian_splatting.py index de6d5201..0a8c10a8 100644 --- a/fvdb/gaussian_splatting.py +++ b/fvdb/gaussian_splatting.py @@ -5,8 +5,9 @@ from typing import Any, Mapping, Sequence, TypeVar, overload import torch -from fvdb.enums import ProjectionType +from fvdb.enums import CameraModel, ProjectionType +from . 
import _fvdb_cpp as _C from ._fvdb_cpp import GaussianSplat3d as GaussianSplat3dCpp from ._fvdb_cpp import JaggedTensor as JaggedTensorCpp from ._fvdb_cpp import ProjectedGaussianSplats as ProjectedGaussianSplatsCpp @@ -1877,6 +1878,117 @@ def render_images( backgrounds=backgrounds, ) + def render_images_from_world( + self, + world_to_camera_matrices: torch.Tensor, + projection_matrices: torch.Tensor, + image_width: int, + image_height: int, + near: float, + far: float, + camera_model: CameraModel = CameraModel.PINHOLE, + distortion_coeffs: torch.Tensor | None = None, + sh_degree_to_use: int = -1, + tile_size: int = 16, + min_radius_2d: float = 0.0, + eps_2d: float = 0.3, + antialias: bool = False, + backgrounds: torch.Tensor | None = None, + masks: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Render dense images by rasterizing directly from world-space 3D Gaussians. + + This is similar to :meth:`render_images`, but the rasterization step is performed in 3D + using per-pixel rays against the Gaussian ellipsoids (instead of rasterizing 2D conics + produced by a projection step). This enables gradients w.r.t. Gaussian geometry + (``means``, ``quats``, ``log_scales``) through rasterization, which is useful for + Unscented Transform (UT)-based OpenCV camera models. + + Notes: + - This is **dense-only**: outputs are dense tensors of shape ``(C, H, W, ...)``. + - Tile intersection data is still computed from a (non-differentiable) projection + step, so gradients can be discontinuous when small parameter changes cause a Gaussian + to enter/leave a tile (or switch which tiles it overlaps). + - Background compositing follows standard "over" alpha compositing. If + ``backgrounds`` is provided, the output color is: + + ``color = sum_i (feat_i * alpha_i * T_i) + T_final * background`` + + where ``T_final`` is the remaining transmittance at the end of rasterization, and + ``alpha = 1 - T_final``. 
+ - ``masks`` is a **per-tile** boolean mask (parity with the classic rasterizer). + Tiles where ``masks[c, th, tw] == False`` are skipped entirely: the output is + background with ``alpha=0`` and the tile contributes **zero gradients**. + + Example: + + .. code-block:: python + + images, alphas = gaussian_splat_3d.render_images_from_world( + world_to_camera_matrices, # [C,4,4] + projection_matrices, # [C,3,3] + image_width=640, + image_height=480, + near=0.01, + far=1e10, + camera_model=fvdb.CameraModel.OPENCV_RATIONAL_8, + distortion_coeffs=dist_coeffs, # [C,12] + backgrounds=bg, # [C,D] + masks=tile_mask, # [C,tileH,tileW] (optional) + ) + + Args: + world_to_camera_matrices (torch.Tensor): Tensor of shape ``(C, 4, 4)``. + projection_matrices (torch.Tensor): Tensor of shape ``(C, 3, 3)``. + image_width (int): Output image width ``W``. + image_height (int): Output image height ``H``. + near (float): Near clipping plane. + far (float): Far clipping plane. + camera_model (CameraModel): Camera model used for ray generation and distortion. + distortion_coeffs (torch.Tensor | None): Distortion coefficients for OpenCV camera + models. Use ``None`` for no distortion. Expected shape is ``(C, 12)`` with packed + layout ``[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]``. For camera models that use fewer + coefficients, unused entries should be set to 0. + sh_degree_to_use (int): SH degree to use. ``-1`` means use all available SH bases. + tile_size (int): Tile size (in pixels). ``tileH = ceil(H / tile_size)``, + ``tileW = ceil(W / tile_size)``. + min_radius_2d (float): Minimum projected radius (in pixels) used for tiling/culling. + eps_2d (float): Padding used during tiling/projection to avoid numerical issues. + antialias (bool): If ``True``, applies opacity correction (when available) when using + ``eps_2d > 0.0``. + backgrounds (torch.Tensor | None): Optional background colors of shape ``(C, D)``, + where ``D`` is :attr:`num_channels`. If ``None``, background is treated as 0. 
+ masks (torch.Tensor | None): Optional per-tile boolean mask of shape + ``(C, tileH, tileW)``. Masked tiles are skipped and filled with background. + + Returns: + images (torch.Tensor): Rendered images of shape ``(C, H, W, D)``. + alpha_images (torch.Tensor): Alpha images of shape ``(C, H, W, 1)``. + """ + if isinstance(camera_model, CameraModel): + camera_model_cpp = getattr(_C.CameraModel, camera_model.name) + else: + camera_model_cpp = camera_model + + return self._impl.render_images_from_world( + world_to_camera_matrices=world_to_camera_matrices, + projection_matrices=projection_matrices, + image_width=image_width, + image_height=image_height, + near=near, + far=far, + camera_model=camera_model_cpp, + distortion_coeffs=distortion_coeffs, + sh_degree_to_use=sh_degree_to_use, + tile_size=tile_size, + min_radius_2d=min_radius_2d, + eps_2d=eps_2d, + antialias=antialias, + backgrounds=backgrounds, + masks=masks, + ) + def sparse_render_images( self, pixels_to_render: JaggedTensorOrTensorT, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8fe92fc9..4cde6456 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,12 +10,10 @@ option(FVDB_BUILD_TESTS "Configure CMake to build tests" ON) option(FVDB_BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(FVDB_STRIP_SYMBOLS "Strip symbols from the build" OFF) option(FVDB_LINEINFO "Enable lineinfo in the build" OFF) -option(FVDB_USE_OPENMP "Enable OpenMP for CPU parallelization" ON) message(STATUS "FVDB: Configure CMake to build tests: ${FVDB_BUILD_TESTS}") message(STATUS "FVDB: Configure CMake to build (google & nvbench) benchmarks: ${FVDB_BUILD_BENCHMARKS}") message(STATUS "FVDB_STRIP_SYMBOLS: ${FVDB_STRIP_SYMBOLS}") -message(STATUS "FVDB_USE_OPENMP: ${FVDB_USE_OPENMP}") # Get dependencies include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/get_cpm.cmake) @@ -51,6 +49,7 @@ set(FVDB_CPP_FILES fvdb/detail/autograd/Inject.cpp fvdb/detail/autograd/GaussianProjection.cpp 
fvdb/detail/autograd/GaussianRasterize.cpp + fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp + fvdb/detail/autograd/GaussianRasterizeSparse.cpp fvdb/detail/autograd/JaggedReduce.cpp fvdb/detail/autograd/MaxPoolGrid.cpp @@ -112,6 +111,8 @@ set(FVDB_CU_FILES fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu fvdb/detail/ops/gsplat/GaussianRasterizeBackward.cu fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu + fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu + fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu fvdb/detail/ops/gsplat/GaussianRasterizeNumContributingGaussians.cu fvdb/detail/ops/gsplat/GaussianRasterizeTopContributingGaussianIds.cu fvdb/detail/ops/gsplat/GaussianRasterizeContributingGaussianIds.cu diff --git a/src/fvdb/GaussianSplat3d.cpp b/src/fvdb/GaussianSplat3d.cpp index bc485b95..d2a1b9ad 100644 --- a/src/fvdb/GaussianSplat3d.cpp +++ b/src/fvdb/GaussianSplat3d.cpp @@ -12,8 +12,10 @@ #include #include #include +#include // Ops headers +#include #include #include #include @@ -343,12 +345,13 @@ GaussianSplat3d::projectGaussiansImpl(const torch::Tensor &worldToCameraMatrices ret.perGaussian2dMean = projectionResults[1]; ret.perGaussianDepth = projectionResults[2]; ret.perGaussianConic = projectionResults[3]; + // FIXME: Use accessors in the kernel and use expand + ret.perGaussianOpacity = opacities().repeat({C, 1}); if (settings.antialias) { - // Antialias compensation is per-camera [C, N], so we must materialize [C, N] - ret.perGaussianOpacity = opacities().unsqueeze(0).expand({C, -1}) * projectionResults[4]; - } else { - // Store as [N]; the opacities() accessor will lazily expand to [C, N] when accessed - ret.perGaussianOpacity = opacities(); + ret.perGaussianOpacity *= projectionResults[4]; + // FIXME (Francis): The contiguity requirement is dumb and should be + // removed by using accessors in the kernel + ret.perGaussianOpacity = ret.perGaussianOpacity.contiguous(); } ret.perGaussianRenderQuantity = [&]() { @@
-476,12 +479,13 @@ GaussianSplat3d::sparseProjectGaussiansImpl(const JaggedTensor &pixelsToRender, ret.perGaussian2dMean = projectionResults[1]; ret.perGaussianDepth = projectionResults[2]; ret.perGaussianConic = projectionResults[3]; + // FIXME: Use accessors in the kernel and use expand + ret.perGaussianOpacity = opacities().repeat({C, 1}); if (settings.antialias) { - // Antialias compensation is per-camera [C, N], so we must materialize [C, N] - ret.perGaussianOpacity = opacities().unsqueeze(0).expand({C, -1}) * projectionResults[4]; - } else { - // Store as [N]; the opacities() accessor will lazily expand to [C, N] when accessed - ret.perGaussianOpacity = opacities(); + ret.perGaussianOpacity *= projectionResults[4]; + // FIXME (Francis): The contiguity requirement is dumb and should be + // removed by using accessors in the kernel + ret.perGaussianOpacity = ret.perGaussianOpacity.contiguous(); } ret.perGaussianRenderQuantity = [&]() { @@ -555,7 +559,7 @@ GaussianSplat3d::renderCropFromProjectedGaussiansImpl( projectedGaussians.perGaussian2dMean, projectedGaussians.perGaussianConic, projectedGaussians.perGaussianRenderQuantity, - projectedGaussians.opacities(), + projectedGaussians.perGaussianOpacity, cropWidth_, cropHeight_, cropOriginW_, @@ -597,7 +601,7 @@ GaussianSplat3d::sparseRenderImpl(const JaggedTensor &pixelsToRender, state.perGaussian2dMean, state.perGaussianConic, state.perGaussianRenderQuantity, - state.opacities(), + state.perGaussianOpacity, settings.imageWidth, settings.imageHeight, 0, @@ -629,7 +633,7 @@ GaussianSplat3d::renderNumContributingGaussiansImpl( return fvdb::detail::ops::dispatchGaussianRasterizeNumContributingGaussians( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, settings); @@ -651,7 +655,7 @@ GaussianSplat3d::sparseRenderNumContributingGaussiansImpl( return fvdb::detail::ops::dispatchGaussianSparseRasterizeNumContributingGaussians< 
DeviceTag>(state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, pixelsToRender, @@ -693,7 +697,7 @@ GaussianSplat3d::renderContributingGaussianIdsImpl( return fvdb::detail::ops::dispatchGaussianRasterizeContributingGaussianIds( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, settings, @@ -717,7 +721,7 @@ GaussianSplat3d::sparseRenderContributingGaussianIdsImpl( return fvdb::detail::ops::dispatchGaussianSparseRasterizeContributingGaussianIds( state.perGaussian2dMean, state.perGaussianConic, - state.opacities(), + state.perGaussianOpacity, state.tileOffsets, state.tileGaussianIds, pixelsToRender, @@ -884,6 +888,154 @@ GaussianSplat3d::renderImages(const torch::Tensor &worldToCameraMatrices, state, settings.tileSize, settings.imageWidth, settings.imageHeight, 0, 0, backgrounds); } +std::tuple +GaussianSplat3d::renderImagesFromWorld(const torch::Tensor &worldToCameraMatrices, + const torch::Tensor &projectionMatrices, + const size_t imageWidth, + const size_t imageHeight, + const float near, + const float far, + const fvdb::detail::ops::CameraModel cameraModel, + const std::optional &distortionCoeffs, + const int64_t shDegreeToUse, + const size_t tileSize, + const float minRadius2d, + const float eps2d, + const bool antialias, + const std::optional &backgrounds, + const std::optional &masks) { + FVDB_FUNC_RANGE(); + const int C = worldToCameraMatrices.size(0); // number of cameras + TORCH_CHECK(C > 0, "At least one camera must be provided (got 0)"); + + torch::Tensor perGaussianRadius; + torch::Tensor perGaussian2dMean; + torch::Tensor perGaussianDepth; + torch::Tensor perGaussianOpacity; + torch::Tensor perGaussianRenderQuantity; + torch::Tensor tileOffsets; + torch::Tensor tileGaussianIds; + + if (cameraModel == fvdb::detail::ops::CameraModel::PINHOLE || + cameraModel == 
fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) { + // Fast path: reuse the classic projection for tiling/sorting. + RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.nearPlane = near; + settings.farPlane = far; + settings.projectionType = (cameraModel == fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) + ? fvdb::detail::ops::ProjectionType::ORTHOGRAPHIC + : fvdb::detail::ops::ProjectionType::PERSPECTIVE; + settings.shDegreeToUse = shDegreeToUse; + settings.radiusClip = minRadius2d; + settings.eps2d = eps2d; + settings.antialias = antialias; + settings.tileSize = tileSize; + settings.renderMode = RenderSettings::RenderMode::RGB; + + const ProjectedGaussianSplats state = + projectGaussiansImpl(worldToCameraMatrices, projectionMatrices, settings); + + perGaussianRadius = state.perGaussianRadius; + perGaussian2dMean = state.perGaussian2dMean; + perGaussianDepth = state.perGaussianDepth; + perGaussianOpacity = state.perGaussianOpacity; + perGaussianRenderQuantity = state.perGaussianRenderQuantity; + tileOffsets = state.tileOffsets; + tileGaussianIds = state.tileGaussianIds; + } else { + // OpenCV camera models: use UT projection to compute radii/depths for tiling/sorting. + // Rasterization itself is still performed with the 3DGS ray-ellipsoid kernel. + const torch::Tensor distortionCoeffs_ = distortionCoeffs.has_value() + ? 
distortionCoeffs.value() + : torch::empty({C, 0}, mMeans.options()); + + fvdb::detail::ops::UTParams utParams = fvdb::detail::ops::UTParams{}; + const auto projectionResults = FVDB_DISPATCH_KERNEL(mMeans.device(), [&]() { + return fvdb::detail::ops::dispatchGaussianProjectionForwardUT( + mMeans, + mQuats, + mLogScales, + worldToCameraMatrices, + worldToCameraMatrices, + projectionMatrices, + fvdb::detail::ops::RollingShutterType::NONE, + utParams, + cameraModel, + distortionCoeffs_, + static_cast(imageWidth), + static_cast(imageHeight), + eps2d, + near, + far, + minRadius2d, + antialias); + }); + + perGaussianRadius = std::get<0>(projectionResults); + perGaussian2dMean = std::get<1>(projectionResults); + perGaussianDepth = std::get<2>(projectionResults); + + // Opacities are stored per-Gaussian (shared across cameras). Expand to [C,N] and apply + // antialiasing compensation (if computed). + perGaussianOpacity = opacities().repeat({C, 1}); + if (antialias) { + const torch::Tensor compensations = std::get<4>(projectionResults); + TORCH_CHECK( + compensations.defined(), + "UT projection returned an undefined compensation tensor in antialias mode"); + perGaussianOpacity *= compensations; + perGaussianOpacity = perGaussianOpacity.contiguous(); + } + + // Evaluate SH for per-camera features. (We use the start pose for view-direction shading.) + perGaussianRenderQuantity = + evalSphericalHarmonicsImpl(shDegreeToUse, worldToCameraMatrices, perGaussianRadius); + + const int numTilesW = std::ceil(imageWidth / static_cast(tileSize)); + const int numTilesH = std::ceil(imageHeight / static_cast(tileSize)); + std::tie(tileOffsets, tileGaussianIds) = FVDB_DISPATCH_KERNEL(mMeans.device(), [&]() { + return detail::ops::dispatchGaussianTileIntersection(perGaussian2dMean, + perGaussianRadius, + perGaussianDepth, + at::nullopt, + C, + tileSize, + numTilesH, + numTilesW); + }); + } + + const torch::Tensor distortionCoeffsForRaster = distortionCoeffs.has_value() + ? 
distortionCoeffs.value() + : torch::empty({C, 0}, mMeans.options()); + + auto outputs = detail::autograd::RasterizeGaussiansToPixelsFromWorld3DGS::apply( + mMeans, + mQuats, + mLogScales, + perGaussianRenderQuantity, + perGaussianOpacity, + worldToCameraMatrices, + worldToCameraMatrices, + projectionMatrices, + distortionCoeffsForRaster, + fvdb::detail::ops::RollingShutterType::NONE, + cameraModel, + static_cast(imageWidth), + static_cast(imageHeight), + 0, + 0, + static_cast(tileSize), + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + + return {outputs[0], outputs[1]}; +} + std::tuple GaussianSplat3d::renderDepths(const torch::Tensor &worldToCameraMatrices, const torch::Tensor &projectionMatrices, diff --git a/src/fvdb/GaussianSplat3d.h b/src/fvdb/GaussianSplat3d.h index 2c6248e1..3716b6ca 100644 --- a/src/fvdb/GaussianSplat3d.h +++ b/src/fvdb/GaussianSplat3d.h @@ -5,12 +5,15 @@ #define FVDB_GAUSSIANSPLAT3D_H #include +#include #include #include #include #include +#include + namespace fvdb { /// @brief A class representing a Gaussian splat scene in 3D space. @@ -905,6 +908,34 @@ class GaussianSplat3d { const bool antialias = false, const std::optional &backgrounds = std::nullopt); + /// @brief Render images by rasterizing directly from world-space 3D Gaussians. + /// + /// This is similar to @ref renderImages but performs rasterization directly from world-space + /// Gaussians (means/quats/log-scales) rather than from their 2D projections. This enables + /// geometry gradients through the rasterization step. + /// + /// Tile intersections are still computed using a (non-differentiable) projection step: + /// - For `cameraModel == CameraModel::PINHOLE` or `CameraModel::ORTHOGRAPHIC`, we reuse the + /// classic projection path. + /// - For OpenCV camera models, we use the Unscented Transform (UT) projection kernel to + /// compute per-Gaussian radii and depths for sorting / tiling, then rasterize with 3DGS. 
+ std::tuple renderImagesFromWorld( + const torch::Tensor &worldToCameraMatrices, + const torch::Tensor &projectionMatrices, + const size_t imageWidth, + const size_t imageHeight, + const float near, + const float far, + const fvdb::detail::ops::CameraModel cameraModel = fvdb::detail::ops::CameraModel::PINHOLE, + const std::optional &distortionCoeffs = std::nullopt, + const int64_t shDegreeToUse = -1, + const size_t tileSize = 16, + const float minRadius2d = 0.0, + const float eps2d = 0.3, + const bool antialias = false, + const std::optional &backgrounds = std::nullopt, + const std::optional &masks = std::nullopt); + /// @brief Render depths of this Gaussian splat scene from the given camera matrices and /// projection matrices. /// @param worldToCameraMatrices [C, 4, 4] Camera-to-world matrices diff --git a/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp new file mode 100644 index 00000000..219890dd --- /dev/null +++ b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.cpp @@ -0,0 +1,204 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include + +#include + +namespace fvdb::detail::autograd { + +RasterizeGaussiansToPixelsFromWorld3DGS::VariableList +RasterizeGaussiansToPixelsFromWorld3DGS::forward( + RasterizeGaussiansToPixelsFromWorld3DGS::AutogradContext *ctx, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &means, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &quats, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &logScales, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &features, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &opacities, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &worldToCamMatricesStart, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &worldToCamMatricesEnd, + const 
RasterizeGaussiansToPixelsFromWorld3DGS::Variable &projectionMatrices, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &distortionCoeffs, + const fvdb::detail::ops::RollingShutterType rollingShutterType, + const fvdb::detail::ops::CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &tileOffsets, + const RasterizeGaussiansToPixelsFromWorld3DGS::Variable &tileGaussianIds, + std::optional backgrounds, + std::optional masks) { + FVDB_FUNC_RANGE_WITH_NAME("RasterizeGaussiansToPixelsFromWorld3DGS::forward"); + + fvdb::detail::ops::RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.imageOriginW = imageOriginW; + settings.imageOriginH = imageOriginH; + settings.tileSize = tileSize; + + auto outputs = FVDB_DISPATCH_KERNEL_DEVICE(means.device(), [&]() { + return ops::dispatchGaussianRasterizeFromWorld3DGSForward( + means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + rollingShutterType, + cameraModel, + settings, + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + }); + + Variable renderedFeatures = std::get<0>(outputs); + Variable renderedAlphas = std::get<1>(outputs); + Variable lastIds = std::get<2>(outputs); + + std::vector toSave = {means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + tileOffsets, + tileGaussianIds, + renderedAlphas, + lastIds}; + if (backgrounds.has_value()) { + toSave.push_back(backgrounds.value()); + ctx->saved_data["has_backgrounds"] = true; + } else { + ctx->saved_data["has_backgrounds"] = false; + } + if (masks.has_value()) { + toSave.push_back(masks.value()); + ctx->saved_data["has_masks"] = true; + } 
else { + ctx->saved_data["has_masks"] = false; + } + ctx->save_for_backward(toSave); + + ctx->saved_data["imageWidth"] = (int64_t)imageWidth; + ctx->saved_data["imageHeight"] = (int64_t)imageHeight; + ctx->saved_data["imageOriginW"] = (int64_t)imageOriginW; + ctx->saved_data["imageOriginH"] = (int64_t)imageOriginH; + ctx->saved_data["tileSize"] = (int64_t)tileSize; + ctx->saved_data["cameraModel"] = (int64_t)cameraModel; + ctx->saved_data["rollingShutterType"] = (int64_t)rollingShutterType; + + return {renderedFeatures, renderedAlphas}; +} + +RasterizeGaussiansToPixelsFromWorld3DGS::VariableList +RasterizeGaussiansToPixelsFromWorld3DGS::backward( + RasterizeGaussiansToPixelsFromWorld3DGS::AutogradContext *ctx, + RasterizeGaussiansToPixelsFromWorld3DGS::VariableList gradOutput) { + FVDB_FUNC_RANGE_WITH_NAME("RasterizeGaussiansToPixelsFromWorld3DGS::backward"); + + Variable dLossDRenderedFeatures = gradOutput.at(0); + Variable dLossDRenderedAlphas = gradOutput.at(1); + if (dLossDRenderedFeatures.defined()) { + dLossDRenderedFeatures = dLossDRenderedFeatures.contiguous(); + } + if (dLossDRenderedAlphas.defined()) { + dLossDRenderedAlphas = dLossDRenderedAlphas.contiguous(); + } + + VariableList saved = ctx->get_saved_variables(); + Variable means = saved.at(0); + Variable quats = saved.at(1); + Variable logScales = saved.at(2); + Variable features = saved.at(3); + Variable opacities = saved.at(4); + Variable worldToCamMatricesStart = saved.at(5); + Variable worldToCamMatricesEnd = saved.at(6); + Variable projectionMatrices = saved.at(7); + Variable distortionCoeffs = saved.at(8); + Variable tileOffsets = saved.at(9); + Variable tileGaussianIds = saved.at(10); + Variable renderedAlphas = saved.at(11); + Variable lastIds = saved.at(12); + + const bool hasBackgrounds = ctx->saved_data["has_backgrounds"].toBool(); + const bool hasMasks = ctx->saved_data["has_masks"].toBool(); + std::optional backgrounds = std::nullopt; + std::optional masks = std::nullopt; + int64_t 
optIdx = 13; + if (hasBackgrounds) { + backgrounds = saved.at(optIdx++); + } + if (hasMasks) { + masks = saved.at(optIdx++); + } + + const uint32_t imageWidth = (uint32_t)ctx->saved_data["imageWidth"].toInt(); + const uint32_t imageHeight = (uint32_t)ctx->saved_data["imageHeight"].toInt(); + const uint32_t imageOriginW = (uint32_t)ctx->saved_data["imageOriginW"].toInt(); + const uint32_t imageOriginH = (uint32_t)ctx->saved_data["imageOriginH"].toInt(); + const uint32_t tileSize = (uint32_t)ctx->saved_data["tileSize"].toInt(); + const auto cameraModel = + static_cast(ctx->saved_data["cameraModel"].toInt()); + const auto rollingShutterType = static_cast( + ctx->saved_data["rollingShutterType"].toInt()); + + fvdb::detail::ops::RenderSettings settings; + settings.imageWidth = imageWidth; + settings.imageHeight = imageHeight; + settings.imageOriginW = imageOriginW; + settings.imageOriginH = imageOriginH; + settings.tileSize = tileSize; + + auto grads = FVDB_DISPATCH_KERNEL_DEVICE(means.device(), [&]() { + return ops::dispatchGaussianRasterizeFromWorld3DGSBackward( + means, + quats, + logScales, + features, + opacities, + worldToCamMatricesStart, + worldToCamMatricesEnd, + projectionMatrices, + distortionCoeffs, + rollingShutterType, + cameraModel, + settings, + tileOffsets, + tileGaussianIds, + renderedAlphas, + lastIds, + dLossDRenderedFeatures, + dLossDRenderedAlphas, + backgrounds, + masks); + }); + + Variable dMeans = std::get<0>(grads); + Variable dQuats = std::get<1>(grads); + Variable dLogScales = std::get<2>(grads); + Variable dFeatures = std::get<3>(grads); + Variable dOpacities = std::get<4>(grads); + + // Return gradients in the same order as forward inputs. 
+ return {dMeans, dQuats, dLogScales, dFeatures, dOpacities, Variable(), Variable(), + Variable(), Variable(), Variable(), Variable(), Variable(), Variable(), Variable(), + Variable(), Variable(), Variable(), Variable(), Variable(), Variable()}; +} + +} // namespace fvdb::detail::autograd diff --git a/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h new file mode 100644 index 00000000..2174dcd0 --- /dev/null +++ b/src/fvdb/detail/autograd/GaussianRasterizeFromWorld.h @@ -0,0 +1,50 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H +#define FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H + +#include + +#include + +#include + +namespace fvdb::detail::autograd { + +/// @brief Autograd wrapper for dense rasterization from world-space 3D Gaussians. +struct RasterizeGaussiansToPixelsFromWorld3DGS + : public torch::autograd::Function { + using VariableList = torch::autograd::variable_list; + using AutogradContext = torch::autograd::AutogradContext; + using Variable = torch::autograd::Variable; + + static VariableList + forward(AutogradContext *ctx, + const Variable &means, // [N,3] + const Variable &quats, // [N,4] + const Variable &logScales, // [N,3] + const Variable &features, // [C,N,D] + const Variable &opacities, // [C,N] + const Variable &worldToCamMatricesStart, // [C,4,4] + const Variable &worldToCamMatricesEnd, // [C,4,4] + const Variable &projectionMatrices, // [C,3,3] + const Variable &distortionCoeffs, // [C,K] + const fvdb::detail::ops::RollingShutterType rollingShutterType, + const fvdb::detail::ops::CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const Variable &tileOffsets, // [C, tileH, tileW] + const Variable &tileGaussianIds, // [n_isects] + std::optional 
backgrounds = std::nullopt, // [C,D] + std::optional masks = std::nullopt); // [C,tileH,tileW] bool + + static VariableList backward(AutogradContext *ctx, VariableList gradOutput); +}; + +} // namespace fvdb::detail::autograd + +#endif // FVDB_DETAIL_AUTOGRAD_GAUSSIANRASTERIZEFROMWORLD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh b/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh new file mode 100644 index 00000000..2d7773c5 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraMatrixUtils.cuh @@ -0,0 +1,61 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH + +#include + +namespace fvdb::detail::ops { + +template +inline __host__ __device__ void +loadWorldToCamRtRowMajor4x4(const T *m44, nanovdb::math::Mat3 &R, nanovdb::math::Vec3 &t) { + // Row-major 4x4 with last column = translation. 
+ R = nanovdb::math::Mat3(m44[0], + m44[1], + m44[2], // 1st row + m44[4], + m44[5], + m44[6], // 2nd row + m44[8], + m44[9], + m44[10]); // 3rd row + t = nanovdb::math::Vec3(m44[3], m44[7], m44[11]); +} + +template +inline __device__ void +loadWorldToCamRtFromAccessor44(const Acc44 &m44 /* [C,4,4] */, + const int64_t camId, + nanovdb::math::Mat3 &R, + nanovdb::math::Vec3 &t) { + R = nanovdb::math::Mat3(m44[camId][0][0], + m44[camId][0][1], + m44[camId][0][2], // 1st row + m44[camId][1][0], + m44[camId][1][1], + m44[camId][1][2], // 2nd row + m44[camId][2][0], + m44[camId][2][1], + m44[camId][2][2]); // 3rd row + t = nanovdb::math::Vec3(m44[camId][0][3], m44[camId][1][3], m44[camId][2][3]); +} + +template +inline __device__ nanovdb::math::Mat3 +loadMat3FromAccessor33(const Acc33 &m33 /* [C,3,3] */, const int64_t camId) { + return nanovdb::math::Mat3(m33[camId][0][0], + m33[camId][0][1], + m33[camId][0][2], + m33[camId][1][0], + m33[camId][1][1], + m33[camId][1][2], + m33[camId][2][0], + m33[camId][2][1], + m33[camId][2][2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMATRIXUTILS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h b/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h new file mode 100644 index 00000000..aa45ba5b --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraModels.h @@ -0,0 +1,40 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H + +#include + +namespace fvdb { +namespace detail { +namespace ops { + +/// @brief Rolling shutter policy for camera projection / ray generation. +enum class RollingShutterType : int32_t { NONE = 0, VERTICAL = 1, HORIZONTAL = 2 }; + +/// @brief Camera model for projection (shared across projection and 3DGS rasterization paths). 
+/// +/// Notes: +/// - `PINHOLE` and `ORTHOGRAPHIC` have no distortion. +/// - `OPENCV_*` variants are pinhole + OpenCV-style distortion, and expect packed coefficients. +enum class CameraModel : int32_t { + // Pinhole intrinsics only (no distortion). + PINHOLE = 0, + + // OpenCV variants which are just pinhole intrinsics + optional distortion (all of them use the + // same [C,12] distortion coefficients layout: [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]). + OPENCV_RADTAN_5 = 1, // polynomial radial (k1,k2,k3) + tangential (p1,p2) + OPENCV_RATIONAL_8 = 2, // rational radial (k1..k6) + tangential (p1,p2) + OPENCV_RADTAN_THIN_PRISM_9 = 3, // polynomial radial + tangential + thin-prism (s1..s4) + OPENCV_THIN_PRISM_12 = 4, // rational radial + tangential + thin-prism (s1..s4) + + // Orthographic intrinsics (no distortion). + ORTHOGRAPHIC = 5, +}; + +} // namespace ops +} // namespace detail +} // namespace fvdb + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERAMODELS_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh b/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh new file mode 100644 index 00000000..0d112ad6 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianCameraSharedMemory.cuh @@ -0,0 +1,72 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __device__ void +copyMat3AccessorToShared(const int64_t C, + nanovdb::math::Mat3 *__restrict__ out, + const Acc33 &acc /* [C,3,3] */) { + constexpr int64_t kElementsPerMat3 = 9; + for (int64_t i = threadIdx.x; i < C * kElementsPerMat3; i += blockDim.x) { + const int64_t camId = i / kElementsPerMat3; + const int64_t entryId = i % kElementsPerMat3; + const int64_t rowId = entryId / 3; + const int64_t colId = entryId % 3; + out[camId][rowId][colId] = 
acc[camId][rowId][colId]; + } +} + +template +inline __device__ void +copyWorldToCamRotationToShared(const int64_t C, + nanovdb::math::Mat3 *__restrict__ out, + const Acc44 &acc /* [C,4,4] */) { + constexpr int64_t kElementsPerMat3 = 9; + for (int64_t i = threadIdx.x; i < C * kElementsPerMat3; i += blockDim.x) { + const int64_t camId = i / kElementsPerMat3; + const int64_t entryId = i % kElementsPerMat3; + const int64_t rowId = entryId / 3; + const int64_t colId = entryId % 3; + out[camId][rowId][colId] = acc[camId][rowId][colId]; + } +} + +template +inline __device__ void +copyWorldToCamTranslationToShared(const int64_t C, + nanovdb::math::Vec3 *__restrict__ out, + const Acc44 &acc /* [C,4,4] */) { + constexpr int64_t kElementsPerVec3 = 3; + for (int64_t i = threadIdx.x; i < C * kElementsPerVec3; i += blockDim.x) { + const int64_t camId = i / kElementsPerVec3; + const int64_t entryId = i % kElementsPerVec3; + out[camId][entryId] = acc[camId][entryId][3]; + } +} + +template +inline __device__ void +copyDistortionCoeffsToShared(const int64_t C, + const int64_t K, + ScalarType *__restrict__ out /* [C*K] */, + const AccCk &acc /* [C,K] */) { + const int64_t total = C * K; + for (int64_t i = threadIdx.x; i < total; i += blockDim.x) { + const int64_t camId = i / K; + const int64_t entryId = i % K; + out[camId * K + entryId] = acc[camId][entryId]; + } +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANCAMERASHAREDMEMORY_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh b/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh new file mode 100644 index 00000000..0a3238a4 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianIntrinsics.cuh @@ -0,0 +1,29 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH + +#include + +namespace fvdb::detail::ops { + +template struct 
CameraIntrinsics { + T fx, fy, cx, cy; +}; + +template +inline __host__ __device__ CameraIntrinsics +loadIntrinsics(const nanovdb::math::Mat3 &K) { + return CameraIntrinsics{K[0][0], K[1][1], K[0][2], K[1][2]}; +} + +template +inline __host__ __device__ CameraIntrinsics +loadIntrinsicsRowMajor3x3(const T *K9) { + return CameraIntrinsics{K9[0], K9[4], K9[2], K9[5]}; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANINTRINSICS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh b/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh new file mode 100644 index 00000000..61e43562 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianOpenCVDistortion.cuh @@ -0,0 +1,125 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH + +#include + +#include + +#include + +namespace fvdb::detail::ops { + +/// @brief Apply OpenCV-style distortion to normalized camera-plane coordinates. +/// +/// This uses FVDB's packed coefficient layout: +/// `[k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]`. +/// +/// Notes: +/// - `CameraModel::PINHOLE` and `CameraModel::ORTHOGRAPHIC` ignore distortion. +/// - Unused coefficients for a given model are ignored (but still read from the packed array). +/// +/// @param cameraModel Camera model selector. +/// @param p_normalized Normalized camera-plane coordinate. +/// - perspective: (x/z, y/z) +/// - orthographic: (x, y) +/// @param distortionCoeffs Pointer to packed coefficients (may be null for non-OpenCV models). +/// @param numCoeffs Number of coefficients per camera (0 for PINHOLE/ORTHOGRAPHIC, 12 for OPENCV). 
+template +inline __host__ __device__ nanovdb::math::Vec2 +applyOpenCVDistortionPacked(const CameraModel cameraModel, + const nanovdb::math::Vec2 &p_normalized, + const T *distortionCoeffs, + const int64_t numCoeffs) { + // For pinhole/orthographic, distortion is ignored. + if (cameraModel == CameraModel::PINHOLE || cameraModel == CameraModel::ORTHOGRAPHIC || + numCoeffs == 0 || distortionCoeffs == nullptr) { + return p_normalized; + } + + // Packed OpenCV coefficient layout: + // [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4] + // NOTE: for RADTAN_5 we use k1,k2,k3 and ignore k4..k6; for rational we use k1..k6. + const T x = p_normalized[0]; + const T y = p_normalized[1]; + const T x2 = x * x; + const T y2 = y * y; + const T xy = x * y; + const T r2 = x2 + y2; + const T r4 = r2 * r2; + const T r6 = r4 * r2; + + const T k1 = distortionCoeffs[0]; + const T k2 = distortionCoeffs[1]; + const T k3 = distortionCoeffs[2]; + const T k4 = distortionCoeffs[3]; + const T k5 = distortionCoeffs[4]; + const T k6 = distortionCoeffs[5]; + const T p1 = distortionCoeffs[6]; + const T p2 = distortionCoeffs[7]; + const T s1 = distortionCoeffs[8]; + const T s2 = distortionCoeffs[9]; + const T s3 = distortionCoeffs[10]; + const T s4 = distortionCoeffs[11]; + + T radial = T(1); + if (cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { + const T num = T(1) + r2 * (k1 + r2 * (k2 + r2 * k3)); + const T den = T(1) + r2 * (k4 + r2 * (k5 + r2 * k6)); + radial = (den != T(0)) ? 
(num / den) : T(0); + } else if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { + radial = T(1) + k1 * r2 + k2 * r4 + k3 * r6; + } + + T x_dist = x * radial; + T y_dist = y * radial; + + // Tangential + x_dist += T(2) * p1 * xy + p2 * (r2 + T(2) * x2); + y_dist += p1 * (r2 + T(2) * y2) + T(2) * p2 * xy; + + // Thin prism + if (cameraModel == CameraModel::OPENCV_THIN_PRISM_12 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { + x_dist += s1 * r2 + s2 * r4; + y_dist += s3 * r2 + s4 * r4; + } + + return nanovdb::math::Vec2(x_dist, y_dist); +} + +/// @brief Iteratively undistort a packed OpenCV distortion model using fixed-point iteration. +/// +/// This solves for `x` such that `applyOpenCVDistortionPacked(x) == p_distorted`. +/// It uses a small fixed iteration count (matching the UT kernel style) and is intended for use in +/// ray generation. +template +inline __host__ __device__ nanovdb::math::Vec2 +undistortOpenCVPackedFixedPoint(const CameraModel cameraModel, + const nanovdb::math::Vec2 &p_distorted, + const T *distortionCoeffs, + const int64_t numCoeffs, + const int iters = 8) { + if (cameraModel == CameraModel::PINHOLE || cameraModel == CameraModel::ORTHOGRAPHIC || + numCoeffs == 0 || distortionCoeffs == nullptr) { + return p_distorted; + } + + nanovdb::math::Vec2 x = p_distorted; + for (int it = 0; it < iters; ++it) { + const nanovdb::math::Vec2 x_dist = + applyOpenCVDistortionPacked(cameraModel, x, distortionCoeffs, numCoeffs); + const nanovdb::math::Vec2 err = x_dist - p_distorted; + x[0] -= err[0]; + x[1] -= err[1]; + } + return x; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANOPENCVDISTORTION_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu index 9f349597..d8f76ed4 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu +++ 
b/src/fvdb/detail/ops/gsplat/GaussianProjectionBackward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include #include @@ -126,9 +127,8 @@ projectionBackwardKernel(const int32_t offset, dLossDConics += idx * 3; // vjp: compute the inverse of the 2d covariance - const nanovdb::math::Mat2 covar2dInverse(conics[0], conics[1], conics[1], conics[2]); - const nanovdb::math::Mat2 dLossDCovar2dInverse( - dLossDConics[0], dLossDConics[1] * .5f, dLossDConics[1] * .5f, dLossDConics[2]); + const nanovdb::math::Mat2 covar2dInverse = loadConicRowMajor3(conics); + const nanovdb::math::Mat2 dLossDCovar2dInverse = loadConicGradRowMajor3(dLossDConics); nanovdb::math::Mat2 dLossDCovar2d = inverseVectorJacobianProduct(covar2dInverse, dLossDCovar2dInverse); @@ -141,41 +141,20 @@ projectionBackwardKernel(const int32_t offset, } // transform Gaussian to camera space - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); nanovdb::math::Mat3 covar; nanovdb::math::Vec4 quat; nanovdb::math::Vec3 scale; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and logScales quats += gId * 4; logScales += gId * 3; - quat = nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]); - scale = nanovdb::math::Vec3(::cuda::std::exp(logScales[0]), - ::cuda::std::exp(logScales[1]), - 
::cuda::std::exp(logScales[2])); - + loadQuatScaleFromLogScalesRowMajor(quats, logScales, quat, scale); covar = quaternionAndScaleToCovariance(quat, scale); } @@ -185,17 +164,16 @@ projectionBackwardKernel(const int32_t offset, const nanovdb::math::Mat3 &covarCamSpace = transformCovarianceWorldToCam(R, covar); // vjp: camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); auto [dLossDCovarCamSpace, dLossDMeanCamSpace] = [&]() { if constexpr (Ortho) { return projectGaussianOrthographicVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -204,10 +182,10 @@ projectionBackwardKernel(const int32_t offset, return projectGaussianPerspectiveVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -263,7 +241,6 @@ projectionBackwardKernel(const int32_t offset, quaternionAndScaleToCovarianceVectorJacobianProduct( quat, scale, rotmat, dLossDCovar); - warpSum(dLossDQuat, warp_group_g); warpSum(dLossDLogScale, warp_group_g); if (warp_group_g.thread_rank() == 0) { outDLossDQuats += gId * 4; diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu index 950cec6e..2f49ceaf 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionForward.cu @@ -1,7 +1,9 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: Apache-2.0 // +#include #include +#include #include #include #include @@ -136,15 +138,26 @@ template struct ProjectionForward { const Mat3 covarCamSpace = transformCovarianceWorldToCam(worldToCamRotMatrix, covar); // camera projection - const T fx 
= projectionMatrix[0][0], cx = projectionMatrix[0][2], - fy = projectionMatrix[1][1], cy = projectionMatrix[1][2]; - auto [covar2d, mean2d] = [&]() { + const CameraIntrinsics intr = loadIntrinsics(projectionMatrix); + auto [covar2d, mean2d] = [&]() { if constexpr (Ortho) { - return projectGaussianOrthographic( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, mImageWidth, mImageHeight); + return projectGaussianOrthographic(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + mImageWidth, + mImageHeight); } else { - return projectGaussianPerspective( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, mImageWidth, mImageHeight); + return projectGaussianPerspective(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + mImageWidth, + mImageHeight); } }(); @@ -157,10 +170,8 @@ template struct ProjectionForward { const Mat2 covar2dInverse = covar2d.inverse(); - // take 3 sigma as the radius - const T b = 0.5f * (covar2d[0][0] + covar2d[1][1]); - const T v1 = b + sqrt(max(0.01f, b * b - det)); - const T radius = ceil(3.f * sqrt(v1)); + // take 3 sigma as the radius (non-differentiable) + const T radius = radiusFromCovariance2dDet(covar2d, det, T(3)); if (radius <= mRadiusClip) { mOutRadiiAcc[cid][gid] = 0; @@ -168,20 +179,20 @@ template struct ProjectionForward { } // Mask out gaussians outside the image region - if (mean2d[0] + radius <= 0 || mean2d[0] - radius >= mImageWidth || - mean2d[1] + radius <= 0 || mean2d[1] - radius >= mImageHeight) { + if (isOutsideImageWithRadius(mean2d, radius, radius, mImageWidth, mImageHeight)) { mOutRadiiAcc[cid][gid] = 0; return; } // Write outputs - mOutRadiiAcc[cid][gid] = int32_t(radius); - mOutMeans2dAcc[cid][gid][0] = mean2d[0]; - mOutMeans2dAcc[cid][gid][1] = mean2d[1]; - mOutDepthsAcc[cid][gid] = meansCamSpace[2]; - mOutConicsAcc[cid][gid][0] = covar2dInverse[0][0]; - mOutConicsAcc[cid][gid][1] = covar2dInverse[0][1]; - mOutConicsAcc[cid][gid][2] = covar2dInverse[1][1]; + 
mOutRadiiAcc[cid][gid] = int32_t(radius); + mOutMeans2dAcc[cid][gid][0] = mean2d[0]; + mOutMeans2dAcc[cid][gid][1] = mean2d[1]; + mOutDepthsAcc[cid][gid] = meansCamSpace[2]; + const nanovdb::math::Vec3 conicPacked = packConicRowMajor3(covar2dInverse); + mOutConicsAcc[cid][gid][0] = conicPacked[0]; + mOutConicsAcc[cid][gid][1] = conicPacked[1]; + mOutConicsAcc[cid][gid][2] = conicPacked[2]; if (mOutCompensationsAcc != nullptr) { mOutCompensationsAcc[idx] = compensation; } @@ -189,37 +200,19 @@ template struct ProjectionForward { inline __device__ void loadCamerasIntoSharedMemory() { - // Load projection matrices and world-to-camera matrices into shared memory + // Load projection matrices and world-to-camera matrices into shared memory. alignas(Mat3) extern __shared__ char sharedMemory[]; - projectionMatsShared = reinterpret_cast(sharedMemory); - worldToCamRotMatsShared = reinterpret_cast(sharedMemory + C * sizeof(Mat3)); - worldToCamTranslation = reinterpret_cast(sharedMemory + C * 2 * sizeof(Mat3)); - - const int64_t totalElements = C * (9 + 9 + 3); - for (int64_t i = threadIdx.x; i < totalElements; i += blockDim.x) { - if (i < C * 9) { - const auto camId = i / 9; - const auto entryId = i % 9; - const auto rowId = entryId / 3; - const auto colId = entryId % 3; - projectionMatsShared[camId][rowId][colId] = - mProjectionMatricesAcc[camId][rowId][colId]; - } else if (i < 2 * C * 9) { - const auto baseIdx = i - C * 9; - const auto camId = baseIdx / 9; - const auto entryId = baseIdx % 9; - const auto rowId = entryId / 3; - const auto colId = entryId % 3; - worldToCamRotMatsShared[camId][rowId][colId] = - mWorldToCamMatricesAcc[camId][rowId][colId]; - } else { - const auto baseIdx = i - 2 * C * 9; - const auto camId = baseIdx / 3; - const auto entryId = baseIdx % 3; - worldToCamTranslation[camId][entryId] = mWorldToCamMatricesAcc[camId][entryId][3]; - } - } + uint8_t *ptr = reinterpret_cast(sharedMemory); + projectionMatsShared = reinterpret_cast(ptr); + ptr += C * 
sizeof(Mat3); + worldToCamRotMatsShared = reinterpret_cast(ptr); + ptr += C * sizeof(Mat3); + worldToCamTranslation = reinterpret_cast(ptr); + + copyMat3AccessorToShared(C, projectionMatsShared, mProjectionMatricesAcc); + copyWorldToCamRotationToShared(C, worldToCamRotMatsShared, mWorldToCamMatricesAcc); + copyWorldToCamTranslationToShared(C, worldToCamTranslation, mWorldToCamMatricesAcc); } }; diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu index 1aa350f8..2c545383 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedBackward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include #include @@ -80,9 +81,8 @@ jaggedProjectionBackwardKernel( dLossDConics += idx * 3; // vjp: compute the inverse of the 2d covariance - const nanovdb::math::Mat2 covar2dInverse(conics[0], conics[1], conics[1], conics[2]); - const nanovdb::math::Mat2 dLossDCovar2dInverse( - dLossDConics[0], dLossDConics[1] * .5f, dLossDConics[1] * .5f, dLossDConics[2]); + const nanovdb::math::Mat2 covar2dInverse = loadConicRowMajor3(conics); + const nanovdb::math::Mat2 dLossDCovar2dInverse = loadConicGradRowMajor3(dLossDConics); nanovdb::math::Mat2 dLossDCovar2d = inverseVectorJacobianProduct(covar2dInverse, dLossDCovar2dInverse); @@ -95,39 +95,20 @@ jaggedProjectionBackwardKernel( } // transform Gaussian to camera space - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); 
nanovdb::math::Mat3 covar; nanovdb::math::Vec4 quat; nanovdb::math::Vec3 scale; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and scales quats += gId * 4; scales += gId * 3; - quat = nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]); - scale = nanovdb::math::Vec3(scales[0], scales[1], scales[2]); - + loadQuatScaleFromScalesRowMajor(quats, scales, quat, scale); covar = quaternionAndScaleToCovariance(quat, scale); } const nanovdb::math::Vec3 &meansCamSpace = @@ -136,17 +117,16 @@ jaggedProjectionBackwardKernel( const nanovdb::math::Mat3 &covarCamSpace = transformCovarianceWorldToCam(R, covar); // vjp: camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); auto [dLossDCovarCamSpace, dLossDMeanCamSpace] = [&]() { if constexpr (Ortho) { return projectGaussianOrthographicVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, @@ -155,10 +135,10 @@ jaggedProjectionBackwardKernel( return projectGaussianPerspectiveVectorJacobianProduct( meansCamSpace, covarCamSpace, - fx, - fy, - cx, - cy, + intr.fx, + intr.fy, + intr.cx, + intr.cy, imageWidth, imageHeight, dLossDCovar2d, diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu index b40c4d18..1211f873 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionJaggedForward.cu @@ -3,6 +3,7 @@ // #include #include +#include #include #include 
#include @@ -63,17 +64,9 @@ jaggedProjectionForwardKernel(const uint32_t B, projectionMatrices += cId * 9; // input is row-major - const nanovdb::math::Mat3 R(worldToCamMatrices[0], - worldToCamMatrices[1], - worldToCamMatrices[2], // 1st row - worldToCamMatrices[4], - worldToCamMatrices[5], - worldToCamMatrices[6], // 2nd row - worldToCamMatrices[8], - worldToCamMatrices[9], - worldToCamMatrices[10]); // 3rd row - const nanovdb::math::Vec3 t( - worldToCamMatrices[3], worldToCamMatrices[7], worldToCamMatrices[11]); + nanovdb::math::Mat3 R; + nanovdb::math::Vec3 t; + loadWorldToCamRtRowMajor4x4(worldToCamMatrices, R, t); // transform Gaussian center to camera space const nanovdb::math::Vec3 meansCamSpace = transformPointWorldToCam(R, t, nanovdb::math::Vec3(means[0], means[1], means[2])); @@ -87,36 +80,39 @@ jaggedProjectionForwardKernel(const uint32_t B, nanovdb::math::Mat3 covar; if (covars != nullptr) { covars += gId * 6; - covar = nanovdb::math::Mat3(covars[0], - covars[1], - covars[2], // 1st row - covars[1], - covars[3], - covars[4], // 2nd row - covars[2], - covars[4], - covars[5] // 3rd row - ); + covar = loadCovarianceRowMajor6(covars); } else { // compute from quaternions and scales quats += gId * 4; scales += gId * 3; - covar = quaternionAndScaleToCovariance( - nanovdb::math::Vec4(quats[0], quats[1], quats[2], quats[3]), - nanovdb::math::Vec3(scales[0], scales[1], scales[2])); + nanovdb::math::Vec4 quat; + nanovdb::math::Vec3 scale; + loadQuatScaleFromScalesRowMajor(quats, scales, quat, scale); + covar = quaternionAndScaleToCovariance(quat, scale); } const nanovdb::math::Mat3 covarCamSpace = transformCovarianceWorldToCam(R, covar); // camera projection - const T fx = projectionMatrices[0], cx = projectionMatrices[2], fy = projectionMatrices[4], - cy = projectionMatrices[5]; - auto [covar2d, mean2d] = [&]() { + const CameraIntrinsics intr = loadIntrinsicsRowMajor3x3(projectionMatrices); + auto [covar2d, mean2d] = [&]() { if constexpr (Ortho) { - return 
projectGaussianOrthographic( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, imageWidth, imageHeight); + return projectGaussianOrthographic(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + imageWidth, + imageHeight); } else { - return projectGaussianPerspective( - meansCamSpace, covarCamSpace, fx, fy, cx, cy, imageWidth, imageHeight); + return projectGaussianPerspective(meansCamSpace, + covarCamSpace, + intr.fx, + intr.fy, + intr.cx, + intr.cy, + imageWidth, + imageHeight); } }(); @@ -130,10 +126,8 @@ jaggedProjectionForwardKernel(const uint32_t B, // compute the inverse of the 2d covariance const nanovdb::math::Mat2 covar2dInverse = covar2d.inverse(); - // take 3 sigma as the radius (non differentiable) - const T b = 0.5f * (covar2d[0][0] + covar2d[1][1]); - const T v1 = b + sqrt(max(0.01f, b * b - det)); - const T radius = ceil(3.f * sqrt(v1)); + // take 3 sigma as the radius (non-differentiable) + const T radius = radiusFromCovariance2dDet(covar2d, det, T(3)); // T v2 = b - sqrt(max(0.1f, b * b - det)); // T radius = ceil(3.f * sqrt(max(v1, v2))); @@ -143,8 +137,7 @@ jaggedProjectionForwardKernel(const uint32_t B, } // mask out gaussians outside the image region - if (mean2d[0] + radius <= 0 || mean2d[0] - radius >= imageWidth || mean2d[1] + radius <= 0 || - mean2d[1] - radius >= imageHeight) { + if (isOutsideImageWithRadius(mean2d, radius, radius, imageWidth, imageHeight)) { radii[idx] = 0; return; } @@ -154,9 +147,7 @@ jaggedProjectionForwardKernel(const uint32_t B, means2d[idx * 2] = mean2d[0]; means2d[idx * 2 + 1] = mean2d[1]; depths[idx] = meansCamSpace[2]; - conics[idx * 3] = covar2dInverse[0][0]; - conics[idx * 3 + 1] = covar2dInverse[0][1]; - conics[idx * 3 + 2] = covar2dInverse[1][1]; + storeConicRowMajor3(covar2dInverse, conics + idx * 3); if (compensations != nullptr) { compensations[idx] = compensation; } diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu 
index 4c1125a2..a8f899de 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.cu @@ -2,7 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include #include +#include +#include #include #include #include @@ -42,19 +46,6 @@ template class OpenCVCameraModel { using Vec3 = nanovdb::math::Vec3; using Mat3 = nanovdb::math::Mat3; - /// @brief Internal distortion evaluation mode. - /// - /// This is intentionally separate from the public API `CameraModel` to keep the math paths - /// explicit and avoid implying that `CameraModel` is “just distortion” (since it also includes - /// projection as well). - enum class Model : uint8_t { - NONE = 0, // no distortion - RADTAN_5 = 5, // k1,k2,p1,p2,k3 (polynomial radial up to r^6) - RATIONAL_8 = 8, // k1,k2,p1,p2,k3,k4,k5,k6 (rational radial) - RADTAN_THIN_9 = 9, // RADTAN_5 + thin prism s1..s4 (polynomial radial + thin-prism) - THIN_PRISM_12 = 12, // RATIONAL_8 + s1,s2,s3,s4 - }; - /// @brief Construct a camera model for projection. /// /// For OpenCV models, coefficients are read from a per-camera packed layout: @@ -70,70 +61,22 @@ template class OpenCVCameraModel { /// cameraModel). __device__ __forceinline__ OpenCVCameraModel(const CameraModel cameraModel, const Mat3 &K_in, const T *distortionCoeffs) - : K(K_in) { + : K(K_in), mCameraModel(cameraModel), mDistortionCoeffs(distortionCoeffs) { // ORTHOGRAPHIC is implemented as pinhole intrinsics without the perspective divide. // Distortion is intentionally not supported for orthographic projection. 
orthographic = (cameraModel == CameraModel::ORTHOGRAPHIC); - if (cameraModel == CameraModel::ORTHOGRAPHIC) { - radial = tangential = thinPrism = nullptr; - numRadial = numTangential = numThinPrism = 0; - model = Model::NONE; + if (cameraModel == CameraModel::ORTHOGRAPHIC || cameraModel == CameraModel::PINHOLE) { + mNumDistortionCoeffs = 0; return; } - if (cameraModel == CameraModel::PINHOLE) { - radial = tangential = thinPrism = nullptr; - numRadial = numTangential = numThinPrism = 0; - model = Model::NONE; - return; - } - - // OpenCV models require the packed coefficient layout. - deviceAssertOrTrap(distortionCoeffs != nullptr); - const T *radial_in = distortionCoeffs + kRadialOffset; // k1..k6 - const T *tang_in = distortionCoeffs + kTangentialOffset; // p1,p2 - const T *thin_in = distortionCoeffs + kThinPrismOffset; // s1..s4 - - if (cameraModel == CameraModel::OPENCV_RADTAN_5) { - radial = radial_in; - numRadial = 3; // k1,k2,k3 (polynomial) - tangential = tang_in; - numTangential = 2; - thinPrism = nullptr; - numThinPrism = 0; - model = Model::RADTAN_5; - return; - } - if (cameraModel == CameraModel::OPENCV_RATIONAL_8) { - radial = radial_in; - numRadial = 6; // k1..k6 (rational) - tangential = tang_in; - numTangential = 2; - thinPrism = nullptr; - numThinPrism = 0; - model = Model::RATIONAL_8; - return; - } - if (cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { - radial = radial_in; - numRadial = 6; // k1..k6 (rational) - tangential = tang_in; - numTangential = 2; - thinPrism = thin_in; - numThinPrism = 4; // s1..s4 - model = Model::THIN_PRISM_12; - return; - } - if (cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9) { - // Polynomial radial + thin-prism; ignore k4..k6 by construction. 
- radial = radial_in; - numRadial = 3; // k1,k2,k3 (polynomial) - tangential = tang_in; - numTangential = 2; - thinPrism = thin_in; - numThinPrism = 4; // s1..s4 - model = Model::RADTAN_THIN_9; + if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9 || + cameraModel == CameraModel::OPENCV_THIN_PRISM_12) { + deviceAssertOrTrap(mDistortionCoeffs != nullptr); + mNumDistortionCoeffs = 12; return; } @@ -162,7 +105,8 @@ template class OpenCVCameraModel { p_normalized = Vec2(p_cam[0] * z_inv, p_cam[1] * z_inv); } - const Vec2 p_distorted = applyDistortion(p_normalized); + const Vec2 p_distorted = applyOpenCVDistortionPacked( + mCameraModel, p_normalized, mDistortionCoeffs, mNumDistortionCoeffs); // Project to pixel coordinates. const T fx = K[0][0]; @@ -194,154 +138,10 @@ template class OpenCVCameraModel { // Camera intrinsics (typically backed by shared memory). const Mat3 &K; - bool orthographic = false; - - // Packed OpenCV coefficient layout offsets: - // [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4] - static constexpr int kRadialOffset = 0; // k1..k6 - static constexpr int kTangentialOffset = 6; // p1,p2 - static constexpr int kThinPrismOffset = 8; // s1..s4 - - // Coefficients for the distortion model. - const T *radial = nullptr; // k1..k6 (but k4..k6 only used in rational model) - int numRadial = 0; // Number of radial coefficients - const T *tangential = nullptr; // p1,p2 - int numTangential = 0; // Number of tangential coefficients - const T *thinPrism = nullptr; // s1..s4 - int numThinPrism = 0; // Number of thin prism coefficients - Model model = Model::NONE; // Distortion model - - /// @brief Read a coefficient if present, otherwise return 0. - /// - /// @param[in] ptr Coefficient array (may be null). - /// @param[in] n Number of coefficients in ptr. - /// @param[in] i Index to read. - /// @return Coefficient value or 0 if out-of-range / null. 
- __host__ __device__ inline T - coeffOrZero(const T *ptr, const int n, const int i) const { - return (ptr != nullptr && i >= 0 && i < n) ? ptr[i] : T(0); - } - - /// @brief Apply OpenCV distortion to a normalized camera-plane point. - /// - /// Input coordinates are assumed to already be normalized to the camera plane: - /// - perspective: \((x/z, y/z)\) - /// - orthographic: \((x, y)\) - /// - /// @param[in] p_normalized Normalized camera-plane coordinates. - /// @return Distorted normalized coordinates. - __device__ Vec2 - applyDistortion(const Vec2 &p_normalized) const { - const T x = p_normalized[0]; - const T y = p_normalized[1]; - const T x2 = x * x; - const T y2 = y * y; - const T xy = x * y; - - const T r2 = x2 + y2; - const T r4 = r2 * r2; - const T r6 = r4 * r2; - - // Radial distortion. - T radial_dist = T(1); - if (model == Model::RATIONAL_8 || model == Model::THIN_PRISM_12) { - const T k1 = coeffOrZero(radial, numRadial, 0); - const T k2 = coeffOrZero(radial, numRadial, 1); - const T k3 = coeffOrZero(radial, numRadial, 2); - const T k4 = coeffOrZero(radial, numRadial, 3); - const T k5 = coeffOrZero(radial, numRadial, 4); - const T k6 = coeffOrZero(radial, numRadial, 5); - const T num = T(1) + r2 * (k1 + r2 * (k2 + r2 * k3)); - const T den = T(1) + r2 * (k4 + r2 * (k5 + r2 * k6)); - radial_dist = (den != T(0)) ? (num / den) : T(0); - } else if (model == Model::RADTAN_5 || model == Model::RADTAN_THIN_9) { - // Polynomial radial (up to k3 / r^6). Thin-prism terms are applied below if enabled. - const T k1 = coeffOrZero(radial, numRadial, 0); - const T k2 = coeffOrZero(radial, numRadial, 1); - const T k3 = coeffOrZero(radial, numRadial, 2); - radial_dist = T(1) + k1 * r2 + k2 * r4 + k3 * r6; - } - - T x_dist = x * radial_dist; - T y_dist = y * radial_dist; - - // Tangential distortion. 
- const T p1 = coeffOrZero(tangential, numTangential, 0); - const T p2 = coeffOrZero(tangential, numTangential, 1); - x_dist += T(2) * p1 * xy + p2 * (r2 + T(2) * x2); - y_dist += p1 * (r2 + T(2) * y2) + T(2) * p2 * xy; - - // Thin-prism distortion. - if (model == Model::THIN_PRISM_12 || model == Model::RADTAN_THIN_9) { - const T s1 = coeffOrZero(thinPrism, numThinPrism, 0); - const T s2 = coeffOrZero(thinPrism, numThinPrism, 1); - const T s3 = coeffOrZero(thinPrism, numThinPrism, 2); - const T s4 = coeffOrZero(thinPrism, numThinPrism, 3); - x_dist += s1 * r2 + s2 * r4; - y_dist += s3 * r2 + s4 * r4; - } - - return Vec2(x_dist, y_dist); - } -}; - -/// @brief UT-local rigid transform (cached rotation + translation). -/// -/// Quaternion is stored as \([w,x,y,z]\) and is assumed to represent a rotation. -/// The corresponding rotation matrix \(R(q)\) is cached to avoid recomputing it for every point -/// transform (UT sigma points, rolling-shutter iterations, depth culls, etc.). -template struct RigidTransform { - nanovdb::math::Mat3 R; - nanovdb::math::Vec4 q; - nanovdb::math::Vec3 t; - - /// @brief Default constructor (identity transform). - /// - /// Initializes to unit quaternion \([1,0,0,0]\) and zero translation. - __device__ - RigidTransform() - : R(nanovdb::math::Mat3(nanovdb::math::Vec3(T(1), T(0), T(0)), - nanovdb::math::Vec3(T(0), T(1), T(0)), - nanovdb::math::Vec3(T(0), T(0), T(1)))), - q(T(1), T(0), T(0), T(0)), t(T(0), T(0), T(0)) {} - - /// @brief Construct from quaternion and translation. - /// @param[in] q_in Rotation quaternion \([w,x,y,z]\). - /// @param[in] t_in Translation vector. - __device__ - RigidTransform(const nanovdb::math::Vec4 &q_in, const nanovdb::math::Vec3 &t_in) - : R(quaternionToRotationMatrix(q_in)), q(q_in), t(t_in) {} - - /// @brief Construct from rotation matrix and translation. - /// @param[in] R_in Rotation matrix. - /// @param[in] t_in Translation vector. 
- __device__ - RigidTransform(const nanovdb::math::Mat3 &R_in, const nanovdb::math::Vec3 &t_in) - : R(R_in), q(rotationMatrixToQuaternion(R_in)), t(t_in) {} - - /// @brief Apply the transform to a 3D point: \(R(q)\,p + t\). - /// @param[in] p_world Point to transform. - /// @return Transformed point. - __device__ __forceinline__ nanovdb::math::Vec3 - apply(const nanovdb::math::Vec3 &p_world) const { - // p_cam = R * p_world + t - return R * p_world + t; - } - - /// @brief Interpolate between two rigid transforms. - /// - /// Translation is linearly interpolated; rotation uses NLERP along the shortest arc. - /// - /// @param[in] u Interpolation parameter in \([0,1]\). - /// @param[in] start Start transform. - /// @param[in] end End transform. - /// @return Interpolated transform. - static inline __device__ RigidTransform - interpolate(const T u, const RigidTransform &start, const RigidTransform &end) { - const nanovdb::math::Vec3 t_interp = start.t + u * (end.t - start.t); - const nanovdb::math::Vec4 q_interp = nlerpQuaternionShortestPath(start.q, end.q, u); - return RigidTransform(q_interp, t_interp); - } + bool orthographic = false; + CameraModel mCameraModel = CameraModel::PINHOLE; + const T *mDistortionCoeffs = nullptr; // packed [12] for OPENCV_* models + int64_t mNumDistortionCoeffs = 0; // 0 for PINHOLE/ORTHOGRAPHIC, 12 for OPENCV_* }; /// @brief Projection status for a single world point. @@ -465,13 +265,8 @@ template struct WorldToPixelTransform { // Fixed small iteration count (good enough for convergence in practice). 
constexpr int kIters = 6; for (int it = 0; it < kIters; ++it) { - ScalarType t_rs = ScalarType(0); - if (rollingShutterType == RollingShutterType::VERTICAL) { - t_rs = floor(pix_prev[1]) / max(ScalarType(1), ScalarType(imageHeight - 1)); - } else if (rollingShutterType == RollingShutterType::HORIZONTAL) { - t_rs = floor(pix_prev[0]) / max(ScalarType(1), ScalarType(imageWidth - 1)); - } - t_rs = min(ScalarType(1), max(ScalarType(0), t_rs)); + const ScalarType t_rs = rollingShutterTimeFromPixel( + rollingShutterType, pix_prev[0], pix_prev[1], imageWidth, imageHeight); const RigidTransform pose_rs = RigidTransform::interpolate(t_rs, worldToCamStart, worldToCamEnd); Vec2 pix_rs(ScalarType(0), ScalarType(0)); @@ -774,7 +569,7 @@ template struct ProjectionForwardUT { /// Layout is `[K, R_start, R_end, t_start, t_end, distortionCoeffs]` per camera. inline __device__ void loadCameraInfoIntoSharedMemory() { - // Load projection matrices and world-to-camera matrices into shared memory + // Load per-camera matrices/coeffs into shared memory. alignas(Mat3) extern __shared__ char sharedMemory[]; // Alignment sanity checks for the shared-memory layout below. If any of these fail, the @@ -783,9 +578,6 @@ template struct ProjectionForwardUT { static_assert(alignof(Mat3) >= alignof(ScalarType), "Mat3 alignment must cover ScalarType alignment"); - constexpr int64_t kMat3Elements = 9; // 3x3 - constexpr int64_t kVec3Elements = 3; // 3 - // Keep a running pointer which we increment to assign shared memory blocks uint8_t *pointer = reinterpret_cast(sharedMemory); @@ -808,54 +600,18 @@ template struct ProjectionForwardUT { mNumDistortionCoeffs > 0 ? 
reinterpret_cast(pointer) : nullptr; pointer += C * mNumDistortionCoeffs * sizeof(ScalarType); - // Layout in element units: - const int64_t projectionOffset = 0; - const int64_t rotStartOffset = projectionOffset + C * kMat3Elements; - const int64_t rotEndOffset = rotStartOffset + C * kMat3Elements; - const int64_t transStartOffset = rotEndOffset + C * kMat3Elements; - const int64_t transEndOffset = transStartOffset + C * kVec3Elements; - const int64_t distortionOffset = transEndOffset + C * kVec3Elements; - const int64_t totalElements = distortionOffset + C * mNumDistortionCoeffs; - - for (int64_t i = threadIdx.x; i < totalElements; i += blockDim.x) { - if (i < rotStartOffset) { - const auto camId = (i - projectionOffset) / kMat3Elements; - const auto entryId = (i - projectionOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - projectionMatsShared[camId][rowId][colId] = - mProjectionMatricesAcc[camId][rowId][colId]; - } else if (i < rotEndOffset) { - const auto camId = (i - rotStartOffset) / kMat3Elements; - const auto entryId = (i - rotStartOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - worldToCamRotMatsStartShared[camId][rowId][colId] = - mWorldToCamMatricesStartAcc[camId][rowId][colId]; - } else if (i < transStartOffset) { - const auto camId = (i - rotEndOffset) / kMat3Elements; - const auto entryId = (i - rotEndOffset) % kMat3Elements; - const auto rowId = entryId / kVec3Elements; - const auto colId = entryId % kVec3Elements; - worldToCamRotMatsEndShared[camId][rowId][colId] = - mWorldToCamMatricesEndAcc[camId][rowId][colId]; - } else if (i < transEndOffset) { - const auto camId = (i - transStartOffset) / kVec3Elements; - const auto entryId = (i - transStartOffset) % kVec3Elements; - worldToCamTranslationStartShared[camId][entryId] = - mWorldToCamMatricesStartAcc[camId][entryId][3]; - } else if (i < distortionOffset) { - 
const auto camId = (i - transEndOffset) / kVec3Elements; - const auto entryId = (i - transEndOffset) % kVec3Elements; - worldToCamTranslationEndShared[camId][entryId] = - mWorldToCamMatricesEndAcc[camId][entryId][3]; - } else if (mNumDistortionCoeffs > 0) { - const auto baseIdx = i - distortionOffset; - const auto camId = baseIdx / mNumDistortionCoeffs; - const auto entryId = baseIdx % mNumDistortionCoeffs; - distortionCoeffsShared[camId * mNumDistortionCoeffs + entryId] = - mDistortionCoeffsAcc[camId][entryId]; - } + copyMat3AccessorToShared(C, projectionMatsShared, mProjectionMatricesAcc); + copyWorldToCamRotationToShared( + C, worldToCamRotMatsStartShared, mWorldToCamMatricesStartAcc); + copyWorldToCamRotationToShared( + C, worldToCamRotMatsEndShared, mWorldToCamMatricesEndAcc); + copyWorldToCamTranslationToShared( + C, worldToCamTranslationStartShared, mWorldToCamMatricesStartAcc); + copyWorldToCamTranslationToShared( + C, worldToCamTranslationEndShared, mWorldToCamMatricesEndAcc); + if (mNumDistortionCoeffs > 0) { + copyDistortionCoeffsToShared( + C, mNumDistortionCoeffs, distortionCoeffsShared, mDistortionCoeffsAcc); } } diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h index 6fc780c6..164e4f41 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUT.h @@ -4,6 +4,8 @@ #ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUT_H #define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUT_H +#include + #include #include @@ -13,31 +15,6 @@ namespace fvdb { namespace detail { namespace ops { -enum class RollingShutterType { NONE = 0, VERTICAL = 1, HORIZONTAL = 2 }; - -/// @brief Camera model for projection in the UT kernel. -/// -/// This enum describes the camera projection family used by the UT kernel. It is intentionally -/// broader than "distortion model" so we can add more complex models (e.g. RPC) later. 
-/// -/// Notes: -/// - `PINHOLE` and `ORTHOGRAPHIC` ignore `distortionCoeffs`. -/// - `OPENCV_*` variants are pinhole + OpenCV-style distortion, and require packed coefficients. -enum class CameraModel : int32_t { - // Pinhole intrinsics only (no distortion). - PINHOLE = 0, - - // Orthographic intrinsics (no distortion). - ORTHOGRAPHIC = 5, - - // OpenCV variants which are just pinhole intrinsics + optional distortion (all of them use the - // same [C,12] distortion coefficients layout: [k1,k2,k3,k4,k5,k6,p1,p2,s1,s2,s3,s4]). - OPENCV_RADTAN_5 = 1, // polynomial radial (k1,k2,k3) + tangential (p1,p2)). - OPENCV_RATIONAL_8 = 2, // rational radial (k1..k6) + tangential (p1,p2)). - OPENCV_RADTAN_THIN_PRISM_9 = 3, // polynomial radial + tangential + thin-prism (s1..s4)). - OPENCV_THIN_PRISM_12 = 4, // rational radial + tangential + thin-prism (s1..s4)). -}; - /// @brief Unscented Transform hyperparameters. /// /// This kernel implements the canonical 3D UT with a fixed \(2D+1\) sigma point set (7 points). 
diff --git a/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh b/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh new file mode 100644 index 00000000..06128665 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianProjectionUtils.cuh @@ -0,0 +1,109 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH + +#include +#include +#include + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __device__ bool +isOutsideImageWithRadius(const nanovdb::math::Vec2 &mean2d, + const T radiusX, + const T radiusY, + const int32_t imageWidth, + const int32_t imageHeight) { + return (mean2d[0] + radiusX <= T(0) || mean2d[0] - radiusX >= T(imageWidth) || + mean2d[1] + radiusY <= T(0) || mean2d[1] - radiusY >= T(imageHeight)); +} + +template +inline __device__ T +radiusFromCovariance2dDet(const nanovdb::math::Mat2 &covar2d, + const T det, + const T sigma, + const T detClamp = T(0.01)) { + // Matches the classic FVDB/gsplat heuristic: use the largest eigenvalue of the 2x2 covariance + // (via trace/determinant) and take `sigma` standard deviations. 
+ const T b = T(0.5) * (covar2d[0][0] + covar2d[1][1]); + const T v1 = b + ::cuda::std::sqrt(nanovdb::math::Max(detClamp, b * b - det)); + return ::cuda::std::ceil(sigma * ::cuda::std::sqrt(v1)); +} + +template +inline __device__ void +storeConicRowMajor3(const nanovdb::math::Mat2 &covar2dInverse, T *outConic3) { + outConic3[0] = covar2dInverse[0][0]; + outConic3[1] = covar2dInverse[0][1]; + outConic3[2] = covar2dInverse[1][1]; +} + +template +inline __device__ nanovdb::math::Vec3 +packConicRowMajor3(const nanovdb::math::Mat2 &covar2dInverse) { + return nanovdb::math::Vec3(covar2dInverse[0][0], covar2dInverse[0][1], covar2dInverse[1][1]); +} + +template +inline __device__ nanovdb::math::Mat2 +loadConicRowMajor3(const T *conic3) { + return nanovdb::math::Mat2(conic3[0], conic3[1], conic3[1], conic3[2]); +} + +template +inline __device__ nanovdb::math::Mat2 +loadConicGradRowMajor3(const T *dConic3) { + // Off-diagonal appears once in packed representation but corresponds to two symmetric entries + // in the full 2x2 matrix. 
+ return nanovdb::math::Mat2(dConic3[0], dConic3[1] * T(0.5), dConic3[1] * T(0.5), dConic3[2]); +} + +template +inline __device__ nanovdb::math::Mat3 +loadCovarianceRowMajor6(const T *covars6) { + // Packed symmetric layout: [xx, xy, xz, yy, yz, zz] + return nanovdb::math::Mat3(covars6[0], + covars6[1], + covars6[2], // 1st row + covars6[1], + covars6[3], + covars6[4], // 2nd row + covars6[2], + covars6[4], + covars6[5] // 3rd row + ); +} + +template +inline __device__ void +loadQuatScaleFromLogScalesRowMajor(const T *quats4, + const T *logScales3, + nanovdb::math::Vec4 &outQuatWxyz, + nanovdb::math::Vec3 &outScale) { + outQuatWxyz = nanovdb::math::Vec4(quats4[0], quats4[1], quats4[2], quats4[3]); + outScale = nanovdb::math::Vec3(::cuda::std::exp(logScales3[0]), + ::cuda::std::exp(logScales3[1]), + ::cuda::std::exp(logScales3[2])); +} + +template +inline __device__ void +loadQuatScaleFromScalesRowMajor(const T *quats4, + const T *scales3, + nanovdb::math::Vec4 &outQuatWxyz, + nanovdb::math::Vec3 &outScale) { + outQuatWxyz = nanovdb::math::Vec4(quats4[0], quats4[1], quats4[2], quats4[3]); + outScale = nanovdb::math::Vec3(scales3[0], scales3[1], scales3[2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANPROJECTIONUTILS_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu index dc620b4f..a4ace0fe 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.cu @@ -259,11 +259,21 @@ rasterizeGaussiansForward(RasterizeForwardArgs( const uint32_t imageOriginW, const uint32_t imageOriginH, const uint32_t tileSize, - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + 
const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { FVDB_FUNC_RANGE(); const uint32_t channels = features.size(-1); const bool isPacked = means2d.dim() == 2; - const std::optional masks = std::nullopt; - #define CALL_FWD_CUDA(N) \ case N: { \ if (isPacked) { \ @@ -676,16 +685,15 @@ dispatchGaussianRasterizeForward( const uint32_t imageOriginH, const uint32_t tileSize, // intersections - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { FVDB_FUNC_RANGE(); const uint32_t channels = features.size(-1); const bool isPacked = means2d.dim() == 2; - const std::optional masks = std::nullopt; - #define CALL_FWD_PRIVATEUSE1(N) \ case N: { \ if (isPacked) { \ @@ -768,9 +776,10 @@ dispatchGaussianRasterizeForward( const uint32_t imageOriginH, const uint32_t tileSize, // intersections - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds, // [C, D] + const at::optional &masks // [C, tile_height, tile_width] bool ) { TORCH_CHECK_NOT_IMPLEMENTED(false, "CPU implementation not available"); } diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h index 21f1e5d0..31db01a9 100644 --- a/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeForward.h @@ -56,9 +56,10 @@ std::tuple dispatchGaussianRasteriz 
const uint32_t imageOriginW, const uint32_t imageOriginH, const uint32_t tileSize, - const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] - const torch::Tensor &tileGaussianIds, // [n_isects] - const at::optional &backgrounds = at::nullopt // [C, D] + const torch::Tensor &tileOffsets, // [C, tile_height, tile_width] + const torch::Tensor &tileGaussianIds, // [n_isects] + const at::optional &backgrounds = at::nullopt, // [C, D] + const at::optional &masks = at::nullopt // [C, tile_height, tile_width] bool ); /// @brief Dispatches the sparse Gaussian rasterization forward pass to the specified device. diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh new file mode 100644 index 00000000..09a53ee0 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorld.cuh @@ -0,0 +1,254 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace fvdb::detail::ops { + +/// Opacity threshold used by 3DGS alpha compositing. +constexpr __device__ float kAlphaThreshold = 0.999f; + +/// Common dense-tile rasterization arguments shared by from-world forward/backward kernels. 
+struct RasterizeFromWorldCommonArgs { + using TileOffsetsAccessor = fvdb::TorchRAcc64; + using TileGaussianIdsAccessor = fvdb::TorchRAcc64; + + uint32_t numCameras; + uint32_t imageWidth; + uint32_t imageHeight; + uint32_t imageOriginW; + uint32_t imageOriginH; + uint32_t tileSize; + uint32_t numTilesW; + uint32_t numTilesH; + uint32_t numChannels; + int32_t totalIntersections; + + TileOffsetsAccessor tileOffsets; // [C, TH, TW] + TileGaussianIdsAccessor tileGaussianIds; // [n_isects] + const float *backgrounds; // [C, D] or nullptr + const bool *masks; // [C, TH, TW] or nullptr + + inline __device__ void + denseCoordinates(uint32_t &cameraId, + uint32_t &tileRow, + uint32_t &tileCol, + uint32_t &row, + uint32_t &col) const { + const uint32_t linearBlock = blockIdx.x; + cameraId = linearBlock / (numTilesH * numTilesW); + const uint32_t tileLinear = linearBlock - cameraId * (numTilesH * numTilesW); + tileRow = tileLinear / numTilesW; + tileCol = tileLinear - tileRow * numTilesW; + row = tileRow * tileSize + threadIdx.y; + col = tileCol * tileSize + threadIdx.x; + } + + inline __device__ uint32_t + tileId(const uint32_t cameraId, const uint32_t tileRow, const uint32_t tileCol) const { + return cameraId * numTilesH * numTilesW + tileRow * numTilesW + tileCol; + } + + inline __device__ bool + tileMasked(const uint32_t cameraId, const uint32_t tileRow, const uint32_t tileCol) const { + return (masks != nullptr) && (!masks[tileId(cameraId, tileRow, tileCol)]); + } + + inline __device__ cuda::std::tuple + tileGaussianRange(const uint32_t cameraId, + const uint32_t tileRow, + const uint32_t tileCol) const { + const int32_t rangeStart = tileOffsets[cameraId][tileRow][tileCol]; + + const int32_t rangeEnd = + ((cameraId == numCameras - 1) && (tileRow == numTilesH - 1) && + (tileCol == numTilesW - 1)) + ? totalIntersections + : ((tileCol + 1 < numTilesW) + ? tileOffsets[cameraId][tileRow][tileCol + 1] + : ((tileRow + 1 < numTilesH) ? 
tileOffsets[cameraId][tileRow + 1][0] + : tileOffsets[cameraId + 1][0][0])); + return {rangeStart, rangeEnd}; + } + + inline __device__ uint32_t + pixelId(const uint32_t row, const uint32_t col) const { + return row * imageWidth + col; + } + + inline __device__ uint32_t + outputPixelBase(const uint32_t cameraId, const uint32_t pixId) const { + return cameraId * imageHeight * imageWidth + pixId; + } + + inline __device__ uint32_t + outputFeatureBase(const uint32_t cameraId, const uint32_t pixId) const { + return outputPixelBase(cameraId, pixId) * numChannels; + } + + inline __device__ float + backgroundValue(const uint32_t cameraId, const uint32_t channelId) const { + return (backgrounds != nullptr) ? backgrounds[cameraId * numChannels + channelId] : 0.0f; + } +}; + +template +inline __device__ nanovdb::math::Vec3 +normalizeSafe(const nanovdb::math::Vec3 &v) { + const T n2 = v.dot(v); + if (n2 > T(0)) { + return v * (T(1) / sqrt(n2)); + } + return nanovdb::math::Vec3(T(0), T(0), T(0)); +} + +/// Vector-Jacobian product for y = normalizeSafe(x). 
+template +inline __device__ nanovdb::math::Vec3 +normalizeSafeVJP(const nanovdb::math::Vec3 &x, const nanovdb::math::Vec3 &v_y) { + const T n2 = x.dot(x); + if (!(n2 > T(0))) { + return nanovdb::math::Vec3(T(0), T(0), T(0)); + } + const T n = sqrt(n2); + const T invn = T(1) / n; + // v_x = (I/n - x x^T / n^3) v_y + const T invn3 = invn * invn * invn; + const T xdotv = x.dot(v_y); + return v_y * invn - x * (xdotv * invn3); +} + +template +inline __device__ nanovdb::math::Ray +pixelToWorldRay(const uint32_t row, + const uint32_t col, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const nanovdb::math::Mat3 &R_wc_start, + const nanovdb::math::Vec3 &t_wc_start, + const nanovdb::math::Mat3 &R_wc_end, + const nanovdb::math::Vec3 &t_wc_end, + const nanovdb::math::Mat3 &K, + const T *distCoeffs, + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel) { + // Pixel center in crop coordinates. + const T px = T(col) + T(imageOriginW) + T(0.5); + const T py = T(row) + T(imageOriginH) + T(0.5); + + // Rolling shutter time based on pixel location. + const T u = rollingShutterTimeFromPixel(rollingShutterType, px, py, imageWidth, imageHeight); + + nanovdb::math::Mat3 R_wc; + nanovdb::math::Vec3 t_wc; + if (rollingShutterType == RollingShutterType::NONE) { + R_wc = R_wc_start; + t_wc = t_wc_start; + } else { + const RigidTransform worldToCamStart(R_wc_start, t_wc_start); + const RigidTransform worldToCamEnd(R_wc_end, t_wc_end); + const RigidTransform worldToCam = + RigidTransform::interpolate(u, worldToCamStart, worldToCamEnd); + R_wc = worldToCam.R; + t_wc = worldToCam.t; + } + + // Invert rigid transform to get camera->world. 
+ const nanovdb::math::Mat3 R_cw = R_wc.transpose(); + + const CameraIntrinsics intr = loadIntrinsics(K); + const nanovdb::math::Vec2 p_distorted((px - intr.cx) / intr.fx, (py - intr.cy) / intr.fy); + const nanovdb::math::Vec2 p = + undistortOpenCVPackedFixedPoint(cameraModel, p_distorted, distCoeffs, numDistCoeffs); + + // Note: `nanovdb::math::Ray` is the standard ray type used elsewhere in FVDB. + // We store world-space `eye` and `dir` (normalized) and leave [t0,t1] at default values. + if (cameraModel == CameraModel::ORTHOGRAPHIC) { + // Parallel rays; origin varies with pixel. + const nanovdb::math::Vec3 o_cam(p[0], p[1], T(0)); + const nanovdb::math::Vec3 d_cam(T(0), T(0), T(1)); + // p_world = R_cw * (p_cam - t_wc) + const nanovdb::math::Vec3 origin_w = R_cw * (o_cam - t_wc); + nanovdb::math::Vec3 dir_w = normalizeSafe(R_cw * d_cam); + return nanovdb::math::Ray(origin_w, dir_w); + } + + // Perspective (pinhole / OpenCV distorted pinhole): origin at camera center. + const nanovdb::math::Vec3 d_cam = normalizeSafe(nanovdb::math::Vec3(p[0], p[1], T(1))); + const nanovdb::math::Vec3 origin_w = + R_cw * (nanovdb::math::Vec3(T(0), T(0), T(0)) - t_wc); + nanovdb::math::Vec3 dir_w = normalizeSafe(R_cw * d_cam); + return nanovdb::math::Ray(origin_w, dir_w); +} + +/// Load quaternion in [w,x,y,z] order. +template +inline __device__ nanovdb::math::Vec4 +quatLoadWxyz(const T *q) { + return nanovdb::math::Vec4(q[0], q[1], q[2], q[3]); +} + +/// Build S^{-1} R^T from quaternion + scale. +template +inline __device__ nanovdb::math::Mat3 +computeIsclRot(const nanovdb::math::Vec4 &quat_wxyz, const nanovdb::math::Vec3 &scale) { + const nanovdb::math::Mat3 R = quaternionToRotationMatrix(quat_wxyz); + const nanovdb::math::Mat3 S_inv( + T(1) / scale[0], T(0), T(0), T(0), T(1) / scale[1], T(0), T(0), T(0), T(1) / scale[2]); + return S_inv * R.transpose(); +} + +/// Vector-Jacobian product for isclRot = S^{-1} R^T. 
+template +inline __device__ void +isclRotVectorJacobianProduct(const nanovdb::math::Vec4 &quat_wxyz, + const nanovdb::math::Vec3 &scale, + const nanovdb::math::Mat3 &dLossDIsclRot, + nanovdb::math::Vec4 &dLossDQuat, + nanovdb::math::Vec3 &dLossDLogScale) { + // TODO(fvdb): Consider returning {dLossDQuat, dLossDLogScale} to match other VJP helpers in + // this module (e.g. `transformCovarianceWorldToCamVectorJacobianProduct`), rather than using + // output reference parameters. + // iscl_rot = S_inv * R^T, with S_inv = diag(1/scale) + const nanovdb::math::Mat3 R = quaternionToRotationMatrix(quat_wxyz); + const nanovdb::math::Vec3 invScale(T(1) / scale[0], T(1) / scale[1], T(1) / scale[2]); + + // For D = A * B, dA = G * B^T, dB = A^T * G. + // Here A = S_inv (diag), B = R^T. + const nanovdb::math::Mat3 dA = dLossDIsclRot * R; // since (R^T)^T = R + const nanovdb::math::Mat3 A( + invScale[0], T(0), T(0), T(0), invScale[1], T(0), T(0), T(0), invScale[2]); + const nanovdb::math::Mat3 dB = A * dLossDIsclRot; + const nanovdb::math::Mat3 dR = dB.transpose(); // B = R^T + + dLossDQuat = quaternionToRotationMatrixVectorJacobianProduct(quat_wxyz, dR); + + // Diagonal of dA gives gradients w.r.t invScale. 
+ const nanovdb::math::Vec3 dInvScale(dA[0][0], dA[1][1], dA[2][2]); + // invScale = 1/scale => dlogScale = dscale * scale = -dInvScale / scale + dLossDLogScale = nanovdb::math::Vec3( + -dInvScale[0] * invScale[0], -dInvScale[1] * invScale[1], -dInvScale[2] * invScale[2]); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLD_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu new file mode 100644 index 00000000..93975586 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.cu @@ -0,0 +1,624 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace fvdb::detail::ops { +namespace cg = cooperative_groups; + +namespace { + +template struct SharedGaussian { + int32_t id; // flattened id in [0, C*N) + nanovdb::math::Vec3 mean; // world mean + nanovdb::math::Vec4 quat; // wxyz + nanovdb::math::Vec3 scale; // exp(log_scales) + nanovdb::math::Mat3 isclR; // S^{-1} R^T + float opacity; +}; + +template +__global__ void +rasterizeFromWorld3DGSBackwardKernel( + const RasterizeFromWorldCommonArgs commonArgs, + // Gaussians + const torch::PackedTensorAccessor64 means, // [N,3] + const torch::PackedTensorAccessor64 quats, // [N,4] + const torch::PackedTensorAccessor64 logScales, // [N,3] + // Per-camera + const torch::PackedTensorAccessor64 features, // [C,N,D] + const torch::PackedTensorAccessor64 opacities, // [C,N] + const torch::PackedTensorAccessor64 + worldToCamStart, // [C,4,4] + const torch::PackedTensorAccessor64 + worldToCamEnd, // [C,4,4] + const torch::PackedTensorAccessor64 K, // [C,3,3] + const torch::PackedTensorAccessor64 + distortionCoeffs, // [C,K] + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + 
const CameraModel cameraModel, + // Forward outputs + const torch::PackedTensorAccessor64 + renderedAlphas, // [C,H,W,1] + const torch::PackedTensorAccessor64 lastIds, // [C,H,W] + // Grad outputs + const torch::PackedTensorAccessor64 + dLossDRenderedFeatures, // [C,H,W,D] + const torch::PackedTensorAccessor64 + dLossDRenderedAlphas, // [C,H,W,1] + // Outputs (grads) + float *__restrict__ dMeans, // [N,3] + float *__restrict__ dQuats, // [N,4] + float *__restrict__ dLogScales, // [N,3] + float *__restrict__ dFeatures, // [C,N,D] + float *__restrict__ dOpacities // [C,N] +) { + auto block = cg::this_thread_block(); + const uint32_t blockSize = blockDim.x * blockDim.y; + + uint32_t camId, tileRow, tileCol, row, col; + commonArgs.denseCoordinates(camId, tileRow, tileCol, row, col); + const bool inside = (row < commonArgs.imageHeight && col < commonArgs.imageWidth); + + // Parity with classic rasterizer: masked tiles contribute nothing. + // + // IMPORTANT: this kernel uses block-level barriers later (`block.sync`). Any early return must + // be taken by *all* threads in the block, otherwise edge tiles can deadlock when some threads + // are `!inside`. So we make the return block-wide. + const bool tileMasked = commonArgs.tileMasked(camId, tileRow, tileCol); + if (tileMasked) { + return; + } + + // Camera pose for this camera. + nanovdb::math::Mat3 R_wc_start, R_wc_end; + nanovdb::math::Vec3 t_wc_start, t_wc_end; + { + loadWorldToCamRtFromAccessor44(worldToCamStart, camId, R_wc_start, t_wc_start); + loadWorldToCamRtFromAccessor44(worldToCamEnd, camId, R_wc_end, t_wc_end); + } + + const nanovdb::math::Mat3 K_cam = loadMat3FromAccessor33(K, camId); + const float *distPtr = (numDistCoeffs > 0) ? 
&distortionCoeffs[camId][0] : nullptr; + + const nanovdb::math::Ray ray = pixelToWorldRay(row, + col, + commonArgs.imageWidth, + commonArgs.imageHeight, + commonArgs.imageOriginW, + commonArgs.imageOriginH, + R_wc_start, + t_wc_start, + R_wc_end, + t_wc_end, + K_cam, + distPtr, + numDistCoeffs, + rollingShutterType, + cameraModel); + + // Whether this pixel participates in the backward pass. + // + // NOTE: We must *not* early-return for `!inside` because the kernel uses `block.sync` later. + const bool rayValid = ray.dir().dot(ray.dir()) > 0.0f; + const bool done = inside && rayValid; + + // Gaussian range for this tile. + const auto [rangeStart, rangeEnd] = commonArgs.tileGaussianRange(camId, tileRow, tileCol); + + // If the tile has no intersections, there is nothing to do. This must be a block-wide return. + if (rangeEnd <= rangeStart) { + return; + } + + // Forward state for this pixel. + int32_t binFinal = -1; + float T_final = 1.0f; + float T = 1.0f; + + float v_render_c[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_render_c[k] = 0.f; + } + float v_render_a = 0.f; + + if (done) { + binFinal = lastIds[camId][row][col]; + + const float alphaFinal = renderedAlphas[camId][row][col][0]; + T_final = 1.0f - alphaFinal; + T = T_final; + +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_render_c[k] = dLossDRenderedFeatures[camId][row][col][k]; + } + v_render_a = dLossDRenderedAlphas[camId][row][col][0]; + } + + float buffer[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + buffer[k] = 0.f; + } + + // Shared memory for gaussian batches. 
+ extern __shared__ char smem[]; + int32_t *idBatch = reinterpret_cast(smem); // [blockSize] + auto *gBatch = + reinterpret_cast *>(&idBatch[blockSize]); // [blockSize] + + const uint32_t threadRank = block.thread_rank(); + cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block); + + const int32_t nIsects = rangeEnd - rangeStart; + const uint32_t nBatches = (nIsects + (int32_t)blockSize - 1) / (int32_t)blockSize; + + // Reduce max binFinal within warp (used to early-skip). + const int32_t warpBinFinal = cg::reduce(warp, binFinal, cg::greater()); + + for (uint32_t b = 0; b < nBatches; ++b) { + block.sync(); + const int32_t batchEnd = rangeEnd - 1 - (int32_t)(blockSize * b); + const int32_t batchSize = min((int32_t)blockSize, batchEnd + 1 - rangeStart); + const int32_t idx = batchEnd - (int32_t)threadRank; + + if (idx >= rangeStart) { + const int32_t flatId = commonArgs.tileGaussianIds[idx]; + idBatch[threadRank] = flatId; + const int32_t gid = flatId % (int32_t)means.size(0); + const int32_t cid = flatId / (int32_t)means.size(0); + + const nanovdb::math::Vec3 mean_w(means[gid][0], means[gid][1], means[gid][2]); + const nanovdb::math::Vec4 quat_wxyz( + quats[gid][0], quats[gid][1], quats[gid][2], quats[gid][3]); + const nanovdb::math::Vec3 scale( + __expf(logScales[gid][0]), __expf(logScales[gid][1]), __expf(logScales[gid][2])); + const nanovdb::math::Mat3 isclR = computeIsclRot(quat_wxyz, scale); + const float op = opacities[cid][gid]; + + gBatch[threadRank].id = flatId; + gBatch[threadRank].mean = mean_w; + gBatch[threadRank].quat = quat_wxyz; + gBatch[threadRank].scale = scale; + gBatch[threadRank].isclR = isclR; + gBatch[threadRank].opacity = op; + } + + block.sync(); + + // Process gaussians in this batch, from back-to-front. 
+ const int32_t startT = max(0, batchEnd - warpBinFinal); + for (int32_t t = startT; t < batchSize; ++t) { + bool valid = done; + if (batchEnd - t > binFinal) { + valid = false; + } + + float alpha = 0.f; + float opac = 0.f; + float vis = 0.f; + + nanovdb::math::Vec3 mean_w(0.f); + nanovdb::math::Vec4 quat_wxyz(1.f, 0.f, 0.f, 0.f); + nanovdb::math::Vec3 scale(1.f); + nanovdb::math::Mat3 Mt; + nanovdb::math::Vec3 o_minus_mu(0.f); + nanovdb::math::Vec3 gro(0.f), grd(0.f), grd_n(0.f), gcrod(0.f); + float grayDist = 0.f; + + if (valid) { + const SharedGaussian g = gBatch[t]; + mean_w = g.mean; + quat_wxyz = g.quat; + scale = g.scale; + Mt = g.isclR; + opac = g.opacity; + + o_minus_mu = ray.eye() - mean_w; + gro = Mt * o_minus_mu; + grd = Mt * ray.dir(); + grd_n = normalizeSafe(grd); + gcrod = grd_n.cross(gro); + grayDist = gcrod.dot(gcrod); + const float power = -0.5f * grayDist; + vis = __expf(power); + alpha = min(kAlphaThreshold, opac * vis); + if (power > 0.f || alpha < 1.f / 255.f) { + valid = false; + } + } + + if (!warp.any(valid)) { + continue; + } + + float v_feat_local[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_feat_local[k] = 0.f; + } + + nanovdb::math::Vec3 v_mean_local(0.f); + float v_quat_local[4] = {0.f, 0.f, 0.f, 0.f}; + nanovdb::math::Vec3 v_logscale_local(0.f); + float v_opacity_local = 0.f; + + if (valid) { + const float ra = 1.0f / (1.0f - alpha); + T *= ra; + + const float fac = alpha * T; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + v_feat_local[k] = fac * v_render_c[k]; + } + + // v_alpha accumulation + float v_alpha = 0.f; + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + const float c = features[cid][gid][k]; + v_alpha += (c * T - buffer[k] * ra) * v_render_c[k]; + } + + v_alpha += T_final * ra * v_render_a; + + if 
(commonArgs.backgrounds != nullptr) { + float accum = 0.f; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + accum += commonArgs.backgroundValue(camId, k) * v_render_c[k]; + } + v_alpha += -T_final * ra * accum; + } + + if (opac * vis <= kAlphaThreshold) { + const float v_vis = opac * v_alpha; + const float v_gradDist = -0.5f * vis * v_vis; + const nanovdb::math::Vec3 v_gcrod = 2.0f * v_gradDist * gcrod; + const nanovdb::math::Vec3 v_grd_n = -(v_gcrod.cross(gro)); + const nanovdb::math::Vec3 v_gro = v_gcrod.cross(grd_n); + + const nanovdb::math::Vec3 v_grd = normalizeSafeVJP(grd, v_grd_n); + + // v_Mt = outer(v_grd, ray.dir) + outer(v_gro, (ray.eye() - mean)) + const nanovdb::math::Vec3 rayDir = ray.dir(); + nanovdb::math::Mat3 v_Mt = + nanovdb::math::Mat3(v_grd[0] * rayDir[0], + v_grd[0] * rayDir[1], + v_grd[0] * rayDir[2], + v_grd[1] * rayDir[0], + v_grd[1] * rayDir[1], + v_grd[1] * rayDir[2], + v_grd[2] * rayDir[0], + v_grd[2] * rayDir[1], + v_grd[2] * rayDir[2]); + v_Mt += nanovdb::math::Mat3(v_gro[0] * o_minus_mu[0], + v_gro[0] * o_minus_mu[1], + v_gro[0] * o_minus_mu[2], + v_gro[1] * o_minus_mu[0], + v_gro[1] * o_minus_mu[1], + v_gro[1] * o_minus_mu[2], + v_gro[2] * o_minus_mu[0], + v_gro[2] * o_minus_mu[1], + v_gro[2] * o_minus_mu[2]); + + const nanovdb::math::Vec3 v_o_minus_mu = Mt.transpose() * v_gro; + v_mean_local += -v_o_minus_mu; + + nanovdb::math::Vec4 dQuat(0.f); + nanovdb::math::Vec3 dLogScale(0.f); + isclRotVectorJacobianProduct(quat_wxyz, scale, v_Mt, dQuat, dLogScale); + v_quat_local[0] += dQuat[0]; + v_quat_local[1] += dQuat[1]; + v_quat_local[2] += dQuat[2]; + v_quat_local[3] += dQuat[3]; + v_logscale_local += dLogScale; + + v_opacity_local = vis * v_alpha; + } + + // buffer update +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + buffer[k] += features[cid][gid][k] * 
fac; + } + } + + // Warp-reduce and atomic add once per gaussian per warp. + warpSumMut(v_opacity_local, warp); + warpSumMut(v_mean_local, warp); + warpSumMut<4>(v_quat_local, warp); + warpSumMut(v_logscale_local, warp); + warpSumMut(v_feat_local, warp); + + if (warp.thread_rank() == 0) { + const int32_t flatId = idBatch[t]; + const int32_t cid = flatId / (int32_t)means.size(0); + const int32_t gid = flatId % (int32_t)means.size(0); + + // Per-camera grads + float *dFeatPtr = + dFeatures + ((cid * (int32_t)means.size(0) + gid) * (int32_t)NUM_CHANNELS); +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + atomicAdd_system(dFeatPtr + k, v_feat_local[k]); + } + atomicAdd_system(dOpacities + (cid * (int32_t)means.size(0) + gid), + v_opacity_local); + + // Geometry grads (shared across cameras) + float *dMeanPtr = dMeans + gid * 3; + atomicAdd_system(dMeanPtr + 0, v_mean_local[0]); + atomicAdd_system(dMeanPtr + 1, v_mean_local[1]); + atomicAdd_system(dMeanPtr + 2, v_mean_local[2]); + + float *dQuatPtr = dQuats + gid * 4; + atomicAdd_system(dQuatPtr + 0, v_quat_local[0]); + atomicAdd_system(dQuatPtr + 1, v_quat_local[1]); + atomicAdd_system(dQuatPtr + 2, v_quat_local[2]); + atomicAdd_system(dQuatPtr + 3, v_quat_local[3]); + + float *dLsPtr = dLogScales + gid * 3; + atomicAdd_system(dLsPtr + 0, v_logscale_local[0]); + atomicAdd_system(dLsPtr + 1, v_logscale_local[1]); + atomicAdd_system(dLsPtr + 2, v_logscale_local[2]); + } + } + } +} + +template +std::tuple +launchBackward(const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamStart, + const torch::Tensor &worldToCamEnd, + const torch::Tensor &K, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t 
imageOriginH, + const uint32_t tileSize, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const torch::Tensor &renderedAlphas, + const torch::Tensor &lastIds, + const torch::Tensor &dLossDRenderedFeatures, + const torch::Tensor &dLossDRenderedAlphas, + const at::optional &backgrounds, + const at::optional &masks) { + const int64_t C = features.size(0); + const int64_t N = means.size(0); + + torch::Tensor dMeans = torch::zeros_like(means); + torch::Tensor dQuats = torch::zeros_like(quats); + torch::Tensor dLogScales = torch::zeros_like(logScales); + torch::Tensor dFeatures = torch::zeros_like(features); + torch::Tensor dOpacities = torch::zeros_like(opacities); + + const uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; + const uint32_t tileExtentH = (imageHeight + tileSize - 1) / tileSize; + const dim3 blockDim(tileSize, tileSize, 1); + const dim3 gridDim(C * tileExtentH * tileExtentW, 1, 1); + const int32_t totalIntersections = static_cast(tileGaussianIds.size(0)); + const int64_t numDistCoeffs = distortionCoeffs.size(1); + + RasterizeFromWorldCommonArgs commonArgs{ + static_cast(C), + imageWidth, + imageHeight, + imageOriginW, + imageOriginH, + tileSize, + tileExtentW, + tileExtentH, + NUM_CHANNELS, + totalIntersections, + tileOffsets.packed_accessor64(), + tileGaussianIds.packed_accessor64(), + nullptr, + nullptr}; + + const PreparedRasterOptionalInputs opt = prepareRasterOptionalInputs( + features, C, tileExtentH, tileExtentW, (int64_t)NUM_CHANNELS, backgrounds, masks); + commonArgs.backgrounds = opt.backgrounds; + commonArgs.masks = opt.masks; + + const size_t blockSize = (size_t)tileSize * (size_t)tileSize; + const size_t sharedMem = blockSize * (sizeof(int32_t) + sizeof(SharedGaussian)); + + auto stream = at::cuda::getDefaultCUDAStream(); + rasterizeFromWorld3DGSBackwardKernel<<>>( + commonArgs, + means.packed_accessor64(), + quats.packed_accessor64(), + logScales.packed_accessor64(), + features.packed_accessor64(), 
+ opacities.packed_accessor64(), + worldToCamStart.packed_accessor64(), + worldToCamEnd.packed_accessor64(), + K.packed_accessor64(), + distortionCoeffs.packed_accessor64(), + numDistCoeffs, + rollingShutterType, + cameraModel, + renderedAlphas.packed_accessor64(), + lastIds.packed_accessor64(), + dLossDRenderedFeatures.packed_accessor64(), + dLossDRenderedAlphas.packed_accessor64(), + dMeans.data_ptr(), + dQuats.data_ptr(), + dLogScales.data_ptr(), + dFeatures.data_ptr(), + dOpacities.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return {dMeans, dQuats, dLogScales, dFeatures, dOpacities}; +} + +} // namespace + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward( + const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamMatricesStart, + const torch::Tensor &worldToCamMatricesEnd, + const torch::Tensor &projectionMatrices, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const RenderSettings &settings, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const torch::Tensor &renderedAlphas, + const torch::Tensor &lastIds, + const torch::Tensor &dLossDRenderedFeatures, + const torch::Tensor &dLossDRenderedAlphas, + const at::optional &backgrounds, + const at::optional &masks) { + FVDB_FUNC_RANGE(); + + const uint32_t imageWidth = settings.imageWidth; + const uint32_t imageHeight = settings.imageHeight; + const uint32_t imageOriginW = settings.imageOriginW; + const uint32_t imageOriginH = settings.imageOriginH; + const uint32_t tileSize = settings.tileSize; + + TORCH_CHECK_VALUE(means.is_cuda(), "means must be CUDA"); + TORCH_CHECK_VALUE(features.is_cuda(), "features must be CUDA"); + TORCH_CHECK_VALUE(opacities.is_cuda(), "opacities must be CUDA"); + TORCH_CHECK_VALUE(renderedAlphas.is_cuda(), "renderedAlphas 
must be CUDA"); + TORCH_CHECK_VALUE(lastIds.is_cuda(), "lastIds must be CUDA"); + + const int64_t C = features.size(0); + const int64_t N = means.size(0); + + // Opacities may be provided either per-camera ([C,N]) or shared across cameras ([N]). + // TODO(fvdb): Avoid materializing a repeated [C,N] tensor when opacities are shared across + // cameras (similar to PR #451). + torch::Tensor opacitiesBatched = opacities; + if (opacitiesBatched.dim() == 1) { + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == N, + "opacities must have shape [N] or [C,N] matching N"); + opacitiesBatched = opacitiesBatched.unsqueeze(0).repeat({C, 1}); + } + opacitiesBatched = opacitiesBatched.contiguous(); + + const uint32_t channels = (uint32_t)features.size(2); + +#define CALL_BWD(NCH) \ + case NCH: \ + return launchBackward(means, \ + quats, \ + logScales, \ + features, \ + opacitiesBatched, \ + worldToCamMatricesStart, \ + worldToCamMatricesEnd, \ + projectionMatrices, \ + distortionCoeffs, \ + rollingShutterType, \ + cameraModel, \ + imageWidth, \ + imageHeight, \ + imageOriginW, \ + imageOriginH, \ + tileSize, \ + tileOffsets, \ + tileGaussianIds, \ + renderedAlphas, \ + lastIds, \ + dLossDRenderedFeatures, \ + dLossDRenderedAlphas, \ + backgrounds, \ + masks); + + switch (channels) { + CALL_BWD(1) + CALL_BWD(2) + CALL_BWD(3) + CALL_BWD(4) + CALL_BWD(5) + CALL_BWD(8) + CALL_BWD(9) + CALL_BWD(16) + CALL_BWD(17) + CALL_BWD(32) + CALL_BWD(33) + CALL_BWD(64) + CALL_BWD(65) + CALL_BWD(128) + CALL_BWD(129) + CALL_BWD(192) + CALL_BWD(193) + CALL_BWD(256) + CALL_BWD(257) + CALL_BWD(512) + CALL_BWD(513) + default: + TORCH_CHECK_VALUE( + false, "Unsupported channels for rasterize-from-world-3dgs backward: ", channels); + } + +#undef CALL_BWD +} + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward(const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const 
torch::Tensor &, + const torch::Tensor &, + const RollingShutterType, + const CameraModel, + const RenderSettings &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const at::optional &, + const at::optional &) { + TORCH_CHECK_VALUE(false, "dispatchGaussianRasterizeFromWorld3DGSBackward is CUDA-only"); +} + +} // namespace fvdb::detail::ops diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h new file mode 100644 index 00000000..4df897ab --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldBackward.h @@ -0,0 +1,62 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H + +#include +#include + +#include + +#include +#include + +namespace fvdb::detail::ops { + +/// @brief Backward pass for dense rasterization from 3D Gaussians using per-pixel rays. +/// +/// Gradients are produced for: +/// - means: [N, 3] +/// - quats: [N, 4] +/// - logScales: [N, 3] +/// - features: [C, N, D] +/// - opacities: [C, N] +/// +/// @tparam DeviceType torch::kCUDA (CPU not implemented). 
+template +std::tuple +dispatchGaussianRasterizeFromWorld3DGSBackward( + // Gaussian parameters (world space) + const torch::Tensor &means, // [N, 3] + const torch::Tensor &quats, // [N, 4] + const torch::Tensor &logScales, // [N, 3] + // Per-camera quantities + const torch::Tensor &features, // [C, N, D] + const torch::Tensor &opacities, // [C, N] + const torch::Tensor &worldToCamMatricesStart, // [C, 4, 4] + const torch::Tensor &worldToCamMatricesEnd, // [C, 4, 4] + const torch::Tensor &projectionMatrices, // [C, 3, 3] + const torch::Tensor &distortionCoeffs, // [C, K] (K=0 or 12) + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Render settings + const RenderSettings &settings, + // Intersections + const torch::Tensor &tileOffsets, // [C, tileH, tileW] + const torch::Tensor &tileGaussianIds, // [n_isects] values in [0, C*N) + // Forward outputs needed for backward + const torch::Tensor &renderedAlphas, // [C, H, W, 1] + const torch::Tensor &lastIds, // [C, H, W] + // Gradients of outputs + const torch::Tensor &dLossDRenderedFeatures, // [C, H, W, D] + const torch::Tensor &dLossDRenderedAlphas, // [C, H, W, 1] + // Optional background (only affects alpha gradient term) + const at::optional &backgrounds = at::nullopt, // [C, D] + // Optional tile masks (parity with classic rasterizer) + const at::optional &masks = at::nullopt // [C, tileH, tileW] bool +); + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDBACKWARD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu new file mode 100644 index 00000000..ceef41cb --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.cu @@ -0,0 +1,471 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include +#include +#include +#include + +#include +#include +#include + 
+#include + +namespace fvdb::detail::ops { +namespace cg = cooperative_groups; + +namespace { + +template struct SharedGaussian { + int32_t id; // flattened id in [0, C*N) + nanovdb::math::Vec3 mean; // world mean + nanovdb::math::Mat3 isclR; // S^{-1} R^T + float opacity; +}; + +template +__global__ void +rasterizeFromWorld3DGSForwardKernel( + const RasterizeFromWorldCommonArgs commonArgs, + // Gaussians + const torch::PackedTensorAccessor64 means, // [N,3] + const torch::PackedTensorAccessor64 quats, // [N,4] + const torch::PackedTensorAccessor64 logScales, // [N,3] + // Per-camera + const torch::PackedTensorAccessor64 features, // [C,N,D] + const torch::PackedTensorAccessor64 opacities, // [C,N] + const torch::PackedTensorAccessor64 + worldToCamStart, // [C,4,4] (view as 3D accessor for contiguous) + const torch::PackedTensorAccessor64 + worldToCamEnd, // [C,4,4] + const torch::PackedTensorAccessor64 K, // [C,3,3] + const torch::PackedTensorAccessor64 + distortionCoeffs, // [C,K] + const int64_t numDistCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Outputs + float *__restrict__ outFeatures, // [C,H,W,D] + float *__restrict__ outAlphas, // [C,H,W,1] packed as [C*H*W] + int32_t *__restrict__ outLastIds // [C,H,W] +) { + const uint32_t blockSize = blockDim.x * blockDim.y; + auto block = cg::this_thread_block(); + + uint32_t camId, tileRow, tileCol, row, col; + commonArgs.denseCoordinates(camId, tileRow, tileCol, row, col); + const bool inside = (row < commonArgs.imageHeight && col < commonArgs.imageWidth); + + // Pixel index within this camera. + const uint32_t pixId = commonArgs.pixelId(row, col); + const uint32_t imgBaseFeat = commonArgs.outputFeatureBase(camId, pixId); + const uint32_t imgBasePix = commonArgs.outputPixelBase(camId, pixId); + + // Parity with classic rasterizer: masked tiles write background and exit. 
+ // + // IMPORTANT: this kernel uses block-level barriers later (`__syncthreads_count`, `block.sync`). + // Any early return must be taken by *all* threads in the block, otherwise edge tiles can + // deadlock when some threads are `!inside`. So we make the return block-wide. + const bool tileMasked = commonArgs.tileMasked(camId, tileRow, tileCol); + if (tileMasked) { + if (inside) { + outAlphas[imgBasePix] = 0.0f; + outLastIds[imgBasePix] = -1; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = commonArgs.backgroundValue(camId, k); + } + } + return; + } + + // Build camera pose for this camera (start/end). + nanovdb::math::Mat3 worldToCamRotationStart, worldToCamRotationEnd; + nanovdb::math::Vec3 worldToCamTranslationStart, worldToCamTranslationEnd; + { + loadWorldToCamRtFromAccessor44( + worldToCamStart, camId, worldToCamRotationStart, worldToCamTranslationStart); + loadWorldToCamRtFromAccessor44( + worldToCamEnd, camId, worldToCamRotationEnd, worldToCamTranslationEnd); + } + + const nanovdb::math::Mat3 K_cam = loadMat3FromAccessor33(K, camId); + + const float *distPtr = (numDistCoeffs > 0) ? &distortionCoeffs[camId][0] : nullptr; + + const nanovdb::math::Ray ray = pixelToWorldRay(row, + col, + commonArgs.imageWidth, + commonArgs.imageHeight, + commonArgs.imageOriginW, + commonArgs.imageOriginH, + worldToCamRotationStart, + worldToCamTranslationStart, + worldToCamRotationEnd, + worldToCamTranslationEnd, + K_cam, + distPtr, + numDistCoeffs, + rollingShutterType, + cameraModel); + + const bool rayValid = ray.dir().dot(ray.dir()) > 0.0f; + bool done = (!inside) || (!rayValid); + + // Determine gaussian range for this tile. + const auto [rangeStart, rangeEnd] = commonArgs.tileGaussianRange(camId, tileRow, tileCol); + + // If no intersections, just write background. + // + // As above, this must be a block-wide return to avoid deadlocks on edge tiles. 
+ if (rangeEnd <= rangeStart) { + if (inside) { + // alpha=0, output background if provided else 0. + outAlphas[imgBasePix] = 0.0f; + outLastIds[imgBasePix] = -1; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = commonArgs.backgroundValue(camId, k); + } + } + return; + } + + // Shared memory for batched gaussians. + extern __shared__ char smem[]; + int32_t *idBatch = reinterpret_cast(smem); // [blockSize] + auto *gaussBatch = + reinterpret_cast *>(&idBatch[blockSize]); // [blockSize] + + float transmittance = 1.0f; + int32_t curIdx = -1; + float pixOut[NUM_CHANNELS]; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + pixOut[k] = 0.f; + } + + const int32_t nIsects = rangeEnd - rangeStart; + const uint32_t nBatches = (nIsects + blockSize - 1) / blockSize; + const uint32_t threadRank = block.thread_rank(); + + for (uint32_t b = 0; b < nBatches; ++b) { + if (__syncthreads_count(done) >= (int)blockSize) { + break; + } + + const int32_t batchStart = rangeStart + (int32_t)(blockSize * b); + const int32_t idx = batchStart + (int32_t)threadRank; + if (idx < rangeEnd) { + const int32_t flatId = commonArgs.tileGaussianIds[idx]; + idBatch[threadRank] = flatId; + const int32_t gid = flatId % (int32_t)means.size(0); + + const nanovdb::math::Vec3 mean_w(means[gid][0], means[gid][1], means[gid][2]); + const nanovdb::math::Vec4 quat_wxyz( + quats[gid][0], quats[gid][1], quats[gid][2], quats[gid][3]); + const nanovdb::math::Vec3 scale( + __expf(logScales[gid][0]), __expf(logScales[gid][1]), __expf(logScales[gid][2])); + const nanovdb::math::Mat3 isclR = computeIsclRot(quat_wxyz, scale); + const int32_t cid = flatId / (int32_t)means.size(0); + const float op = opacities[cid][gid]; + + gaussBatch[threadRank].id = flatId; + gaussBatch[threadRank].mean = mean_w; + gaussBatch[threadRank].isclR = isclR; + gaussBatch[threadRank].opacity = op; + } + + __syncthreads(); + + const uint32_t batchSize = + min((uint32_t)blockSize, 
(uint32_t)max(0, rangeEnd - batchStart)); + for (uint32_t t = 0; (t < batchSize) && !done; ++t) { + const SharedGaussian g = gaussBatch[t]; + // 3DGS ray-ellipsoid visibility in "whitened" coordinates (see 3D-GUT paper Fig. 11). + // gro = S^{-1} R^T (o - μ) + // grd = normalize( S^{-1} R^T d ) + // gcrod = grd × gro (distance proxy to principal axis in whitened space) + const nanovdb::math::Vec3 gro = g.isclR * (ray.eye() - g.mean); + const nanovdb::math::Vec3 grd = normalizeSafe(g.isclR * ray.dir()); + const nanovdb::math::Vec3 gcrod = grd.cross(gro); + const float grayDist = gcrod.dot(gcrod); + const float power = -0.5f * grayDist; + const float vis = __expf(power); + float alpha = min(kAlphaThreshold, g.opacity * vis); + if (power > 0.f || alpha < 1.f / 255.f) { + continue; + } + const float nextTransmittance = transmittance * (1.0f - alpha); + if (nextTransmittance <= 1e-4f) { + done = true; + break; + } + const float contrib = alpha * transmittance; + const int32_t cid = g.id / (int32_t)means.size(0); + const int32_t gid = g.id % (int32_t)means.size(0); +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + pixOut[k] += features[cid][gid][k] * contrib; + } + curIdx = (uint32_t)(batchStart + (int32_t)t); + transmittance = nextTransmittance; + } + } + + if (!inside) { + return; + } + + outAlphas[imgBasePix] = 1.0f - transmittance; + outLastIds[imgBasePix] = curIdx; +#pragma unroll + for (uint32_t k = 0; k < NUM_CHANNELS; ++k) { + outFeatures[imgBaseFeat + k] = + pixOut[k] + transmittance * commonArgs.backgroundValue(camId, k); + } +} + +template +std::tuple +launchForward(const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamStart, + const torch::Tensor &worldToCamEnd, + const torch::Tensor &K, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + 
const uint32_t imageWidth, + const uint32_t imageHeight, + const uint32_t imageOriginW, + const uint32_t imageOriginH, + const uint32_t tileSize, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const at::optional &backgrounds, + const at::optional &masks) { + const int64_t C = features.size(0); + + auto opts = features.options(); + torch::Tensor outFeatures = + torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth, (int64_t)NUM_CHANNELS}, opts); + torch::Tensor outAlphas = torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth, 1}, opts); + torch::Tensor outLastIds = + torch::zeros({C, (int64_t)imageHeight, (int64_t)imageWidth}, + torch::TensorOptions().dtype(torch::kInt32).device(features.device())); + + const uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; + const uint32_t tileExtentH = (imageHeight + tileSize - 1) / tileSize; + const dim3 blockDim(tileSize, tileSize, 1); + const dim3 gridDim(C * tileExtentH * tileExtentW, 1, 1); + + const int32_t totalIntersections = (int32_t)tileGaussianIds.size(0); + const int64_t numDistCoeffs = distortionCoeffs.size(1); + + RasterizeFromWorldCommonArgs commonArgs{ + static_cast(C), + imageWidth, + imageHeight, + imageOriginW, + imageOriginH, + tileSize, + tileExtentW, + tileExtentH, + NUM_CHANNELS, + totalIntersections, + tileOffsets.packed_accessor64(), + tileGaussianIds.packed_accessor64(), + nullptr, + nullptr}; + + const PreparedRasterOptionalInputs opt = prepareRasterOptionalInputs( + features, C, tileExtentH, tileExtentW, (int64_t)NUM_CHANNELS, backgrounds, masks); + commonArgs.backgrounds = opt.backgrounds; + commonArgs.masks = opt.masks; + + const size_t blockSize = (size_t)tileSize * (size_t)tileSize; + const size_t sharedMem = blockSize * (sizeof(int32_t) + sizeof(SharedGaussian)); + + auto stream = at::cuda::getDefaultCUDAStream(); + rasterizeFromWorld3DGSForwardKernel<<>>( + commonArgs, + means.packed_accessor64(), + quats.packed_accessor64(), + 
logScales.packed_accessor64(), + features.packed_accessor64(), + opacities.packed_accessor64(), + worldToCamStart.packed_accessor64(), + worldToCamEnd.packed_accessor64(), + K.packed_accessor64(), + distortionCoeffs.packed_accessor64(), + numDistCoeffs, + rollingShutterType, + cameraModel, + outFeatures.data_ptr(), + outAlphas.data_ptr(), + outLastIds.data_ptr()); + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return {outFeatures, outAlphas, outLastIds}; +} + +} // namespace + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward( + const torch::Tensor &means, + const torch::Tensor &quats, + const torch::Tensor &logScales, + const torch::Tensor &features, + const torch::Tensor &opacities, + const torch::Tensor &worldToCamMatricesStart, + const torch::Tensor &worldToCamMatricesEnd, + const torch::Tensor &projectionMatrices, + const torch::Tensor &distortionCoeffs, + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + const RenderSettings &settings, + const torch::Tensor &tileOffsets, + const torch::Tensor &tileGaussianIds, + const at::optional &backgrounds, + const at::optional &masks) { + FVDB_FUNC_RANGE(); + + const uint32_t imageWidth = settings.imageWidth; + const uint32_t imageHeight = settings.imageHeight; + const uint32_t imageOriginW = settings.imageOriginW; + const uint32_t imageOriginH = settings.imageOriginH; + const uint32_t tileSize = settings.tileSize; + + TORCH_CHECK_VALUE(means.is_cuda(), "means must be CUDA"); + TORCH_CHECK_VALUE(features.is_cuda(), "features must be CUDA"); + TORCH_CHECK_VALUE(tileOffsets.is_cuda(), "tileOffsets must be CUDA"); + TORCH_CHECK_VALUE(tileGaussianIds.is_cuda(), "tileGaussianIds must be CUDA"); + + TORCH_CHECK_VALUE(means.dim() == 2 && means.size(1) == 3, "means must have shape [N,3]"); + TORCH_CHECK_VALUE(quats.dim() == 2 && quats.size(1) == 4, "quats must have shape [N,4]"); + TORCH_CHECK_VALUE(logScales.dim() == 2 && logScales.size(1) == 3, + "logScales must have shape [N,3]"); + 
TORCH_CHECK_VALUE(features.dim() == 3, "features must have shape [C,N,D]"); + + const int64_t C = features.size(0); + const int64_t N = means.size(0); + TORCH_CHECK_VALUE(features.size(1) == N, "features must have shape [C,N,D] matching N"); + TORCH_CHECK_VALUE(opacities.is_cuda(), "opacities must be CUDA"); + + // Opacities may be provided either per-camera ([C,N]) or shared across cameras ([N]). + // TODO(fvdb): Avoid materializing a repeated [C,N] tensor when opacities are shared across + // cameras. We should be able to pass a view (or otherwise avoid the repeat) similar to the + // approach used in PR #451 for other rasterization paths. + torch::Tensor opacitiesBatched = opacities; + if (opacitiesBatched.dim() == 1) { + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == N, + "opacities must have shape [N] or [C,N] matching N"); + opacitiesBatched = opacitiesBatched.unsqueeze(0).repeat({C, 1}); + } + TORCH_CHECK_VALUE(opacitiesBatched.dim() == 2, "opacities must have shape [C,N]"); + TORCH_CHECK_VALUE(opacitiesBatched.size(0) == C && opacitiesBatched.size(1) == N, + "opacities must have shape [C,N] matching features and N"); + opacitiesBatched = opacitiesBatched.contiguous(); + + TORCH_CHECK_VALUE(worldToCamMatricesStart.sizes() == torch::IntArrayRef({C, 4, 4}), + "worldToCamMatricesStart must have shape [C,4,4]"); + TORCH_CHECK_VALUE(worldToCamMatricesEnd.sizes() == torch::IntArrayRef({C, 4, 4}), + "worldToCamMatricesEnd must have shape [C,4,4]"); + TORCH_CHECK_VALUE(projectionMatrices.sizes() == torch::IntArrayRef({C, 3, 3}), + "projectionMatrices must have shape [C,3,3]"); + + const int64_t numDistCoeffs = distortionCoeffs.size(1); + TORCH_CHECK_VALUE(distortionCoeffs.dim() == 2 && distortionCoeffs.size(0) == C, + "distortionCoeffs must have shape [C,K]"); + if (cameraModel == CameraModel::OPENCV_RADTAN_5 || + cameraModel == CameraModel::OPENCV_RATIONAL_8 || + cameraModel == CameraModel::OPENCV_RADTAN_THIN_PRISM_9 || + cameraModel == 
CameraModel::OPENCV_THIN_PRISM_12) { + TORCH_CHECK_VALUE(numDistCoeffs == 12, + "For CameraModel::OPENCV_* distortionCoeffs must be [C,12]"); + } + + const uint32_t channels = (uint32_t)features.size(2); + +#define CALL_FWD(NCH) \ + case NCH: \ + return launchForward(means, \ + quats, \ + logScales, \ + features, \ + opacitiesBatched, \ + worldToCamMatricesStart, \ + worldToCamMatricesEnd, \ + projectionMatrices, \ + distortionCoeffs, \ + rollingShutterType, \ + cameraModel, \ + imageWidth, \ + imageHeight, \ + imageOriginW, \ + imageOriginH, \ + tileSize, \ + tileOffsets, \ + tileGaussianIds, \ + backgrounds, \ + masks); + + switch (channels) { + CALL_FWD(1) + CALL_FWD(2) + CALL_FWD(3) + CALL_FWD(4) + CALL_FWD(5) + CALL_FWD(8) + CALL_FWD(9) + CALL_FWD(16) + CALL_FWD(17) + CALL_FWD(32) + CALL_FWD(33) + CALL_FWD(64) + CALL_FWD(65) + CALL_FWD(128) + CALL_FWD(129) + CALL_FWD(192) + CALL_FWD(193) + CALL_FWD(256) + CALL_FWD(257) + CALL_FWD(512) + CALL_FWD(513) + default: + TORCH_CHECK_VALUE(false, "Unsupported channels for rasterize-from-world-3dgs: ", channels); + } + +#undef CALL_FWD +} + +template <> +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward(const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const torch::Tensor &, + const RollingShutterType, + const CameraModel, + const RenderSettings &, + const torch::Tensor &, + const torch::Tensor &, + const at::optional &, + const at::optional &) { + TORCH_CHECK_VALUE(false, "dispatchGaussianRasterizeFromWorld3DGSForward is CUDA-only"); +} + +} // namespace fvdb::detail::ops diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h new file mode 100644 index 00000000..4b00dac9 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeFromWorldForward.h @@ -0,0 +1,61 @@ +// 
Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H + +#include +#include + +#include + +#include +#include + +namespace fvdb::detail::ops { + +/// @brief Rasterize images directly from 3D Gaussians using per-pixel rays. +/// +/// This kernel follows the gsplat "RasterizeToPixelsFromWorld3DGS" algorithm, but is wired to +/// FVDB's existing tile intersection representation (`tileOffsets`, `tileGaussianIds`). +/// +/// Inputs are world-space Gaussians (means/quats/logScales) and per-camera per-gaussian features +/// and opacities. The camera is defined via world->camera matrices (start/end), intrinsics, +/// `CameraModel`, rolling shutter policy, and packed OpenCV distortion coefficients. +/// +/// This is a dense-only rasterizer: outputs are dense tensors of shape +/// - renderedFeatures: [C, H, W, D] +/// - renderedAlphas: [C, H, W, 1] +/// - lastIds: [C, H, W] +/// +/// @tparam DeviceType torch::kCUDA (CPU not implemented). 
+template +std::tuple +dispatchGaussianRasterizeFromWorld3DGSForward( + // Gaussian parameters (world space) + const torch::Tensor &means, // [N, 3] + const torch::Tensor &quats, // [N, 4] (w,x,y,z) + const torch::Tensor &logScales, // [N, 3] + // Per-camera quantities + const torch::Tensor &features, // [C, N, D] + const torch::Tensor &opacities, // [C, N] + const torch::Tensor &worldToCamMatricesStart, // [C, 4, 4] + const torch::Tensor &worldToCamMatricesEnd, // [C, 4, 4] + const torch::Tensor &projectionMatrices, // [C, 3, 3] + const torch::Tensor &distortionCoeffs, // [C, K] (K=0 or 12) + const RollingShutterType rollingShutterType, + const CameraModel cameraModel, + // Render settings + const RenderSettings &settings, + // Intersections + const torch::Tensor &tileOffsets, // [C, tileH, tileW] + const torch::Tensor &tileGaussianIds, // [n_isects] values in [0, C*N) + // Optional background + const at::optional &backgrounds = at::nullopt, // [C, D] + // Optional tile masks (parity with classic rasterizer) + const at::optional &masks = at::nullopt // [C, tileH, tileW] bool +); + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEFROMWORLDFORWARD_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h b/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h new file mode 100644 index 00000000..79239f53 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRasterizeOptionalInputs.h @@ -0,0 +1,59 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H + +#include +#include + +#include + +namespace fvdb::detail::ops { + +struct PreparedRasterOptionalInputs { + const float *backgrounds = nullptr; + const bool *masks = nullptr; + torch::Tensor backgroundsContig; + torch::Tensor masksContig; +}; + +inline PreparedRasterOptionalInputs 
+prepareRasterOptionalInputs(const torch::Tensor &features, + const int64_t C, + const int64_t tileExtentH, + const int64_t tileExtentW, + const int64_t numChannels, + const at::optional &backgrounds, + const at::optional &masks) { + PreparedRasterOptionalInputs out; + + if (backgrounds.has_value()) { + TORCH_CHECK(backgrounds.value().is_cuda(), "backgrounds must be CUDA"); + TORCH_CHECK(backgrounds.value().device() == features.device(), + "backgrounds must be on the same device as features"); + TORCH_CHECK(backgrounds.value().scalar_type() == torch::kFloat32, + "backgrounds must have dtype=float32"); + TORCH_CHECK(backgrounds.value().sizes() == torch::IntArrayRef({C, numChannels}), + "backgrounds must have shape [C, NUM_CHANNELS]"); + out.backgroundsContig = backgrounds.value().contiguous(); + out.backgrounds = out.backgroundsContig.data_ptr(); + } + + if (masks.has_value()) { + TORCH_CHECK(masks.value().is_cuda(), "masks must be CUDA"); + TORCH_CHECK(masks.value().device() == features.device(), + "masks must be on the same device as features"); + TORCH_CHECK(masks.value().scalar_type() == torch::kBool, "masks must have dtype=bool"); + TORCH_CHECK(masks.value().sizes() == torch::IntArrayRef({C, tileExtentH, tileExtentW}), + "masks must have shape [C, tileExtentH, tileExtentW]"); + out.masksContig = masks.value().contiguous(); + out.masks = out.masksContig.data_ptr(); + } + + return out; +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRASTERIZEOPTIONALINPUTS_H diff --git a/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh b/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh new file mode 100644 index 00000000..ff8d4a48 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRigidTransform.cuh @@ -0,0 +1,67 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH + 
+#include + +#include + +namespace fvdb::detail::ops { + +/// @brief Rigid transform (cached rotation + translation). +/// +/// Quaternion is stored as \([w,x,y,z]\) and is assumed to represent a rotation. +/// The corresponding rotation matrix \(R(q)\) is cached to avoid recomputing it for every point +/// transform (UT sigma points, rolling-shutter iterations, ray generation, etc.). +template struct RigidTransform { + nanovdb::math::Mat3 R; + nanovdb::math::Vec4 q; + nanovdb::math::Vec3 t; + + /// @brief Default constructor (identity transform). + /// + /// Initializes to unit quaternion \([1,0,0,0]\) and zero translation. + inline __host__ __device__ + RigidTransform() + : R(nanovdb::math::Mat3(nanovdb::math::Vec3(T(1), T(0), T(0)), + nanovdb::math::Vec3(T(0), T(1), T(0)), + nanovdb::math::Vec3(T(0), T(0), T(1)))), + q(T(1), T(0), T(0), T(0)), t(T(0), T(0), T(0)) {} + + /// @brief Construct from quaternion and translation. + /// @param[in] q_in Rotation quaternion \([w,x,y,z]\). + /// @param[in] t_in Translation vector. + inline __host__ __device__ + RigidTransform(const nanovdb::math::Vec4 &q_in, const nanovdb::math::Vec3 &t_in) + : R(quaternionToRotationMatrix(q_in)), q(q_in), t(t_in) {} + + /// @brief Construct from rotation matrix and translation. + /// @param[in] R_in Rotation matrix. + /// @param[in] t_in Translation vector. + inline __host__ __device__ + RigidTransform(const nanovdb::math::Mat3 &R_in, const nanovdb::math::Vec3 &t_in) + : R(R_in), q(rotationMatrixToQuaternion(R_in)), t(t_in) {} + + /// @brief Apply the transform to a 3D point: \(R(q)\,p + t\). + inline __host__ __device__ nanovdb::math::Vec3 + apply(const nanovdb::math::Vec3 &p_world) const { + // p_cam = R * p_world + t + return R * p_world + t; + } + + /// @brief Interpolate between two rigid transforms. + /// + /// Translation is linearly interpolated; rotation uses NLERP along the shortest arc. 
+ inline static __host__ __device__ RigidTransform + interpolate(const T u, const RigidTransform &start, const RigidTransform &end) { + const nanovdb::math::Vec3 t_interp = start.t + u * (end.t - start.t); + const nanovdb::math::Vec4 q_interp = nlerpQuaternionShortestPath(start.q, end.q, u); + return RigidTransform(q_interp, t_interp); + } +}; + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANRIGIDTRANSFORM_CUH diff --git a/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh b/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh new file mode 100644 index 00000000..be69d9d8 --- /dev/null +++ b/src/fvdb/detail/ops/gsplat/GaussianRollingShutter.cuh @@ -0,0 +1,53 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: Apache-2.0 +// +#ifndef FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH +#define FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH + +#include + +#include + +namespace fvdb::detail::ops { + +template +inline __host__ __device__ T +clamp01(const T x) { + return (x < T(0)) ? T(0) : ((x > T(1)) ? T(1) : x); +} + +/// @brief Convert a pixel coordinate to rolling-shutter time in \([0,1]\). +/// +/// FVDB convention: +/// - vertical: time varies with image row (y) +/// - horizontal: time varies with image column (x) +/// - none: time is 0 +/// +/// Pixel coordinates are in image space (not normalized), and we use `floor` (matching existing +/// kernels) before normalizing by (dim-1). +template +inline __host__ __device__ T +rollingShutterTimeFromPixel(const RollingShutterType rollingShutterType, + const T px, + const T py, + const int64_t imageWidth, + const int64_t imageHeight) { + if (rollingShutterType == RollingShutterType::NONE) { + return T(0); + } + + T u = T(0); + T denom = T(1); + if (rollingShutterType == RollingShutterType::VERTICAL) { + denom = (imageHeight > 1) ? 
T(imageHeight - 1) : T(1); + u = ::cuda::std::floor(py) / denom; + } else if (rollingShutterType == RollingShutterType::HORIZONTAL) { + denom = (imageWidth > 1) ? T(imageWidth - 1) : T(1); + u = ::cuda::std::floor(px) / denom; + } + return clamp01(u); +} + +} // namespace fvdb::detail::ops + +#endif // FVDB_DETAIL_OPS_GSPLAT_GAUSSIANROLLINGSHUTTER_CUH diff --git a/src/python/GaussianSplatBinding.cpp b/src/python/GaussianSplatBinding.cpp index 88255495..1154b650 100644 --- a/src/python/GaussianSplatBinding.cpp +++ b/src/python/GaussianSplatBinding.cpp @@ -8,11 +8,28 @@ #include #include #include +#include #include void bind_gaussian_splat3d(py::module &m) { + py::enum_(m, "RollingShutterType") + .value("NONE", fvdb::detail::ops::RollingShutterType::NONE) + .value("VERTICAL", fvdb::detail::ops::RollingShutterType::VERTICAL) + .value("HORIZONTAL", fvdb::detail::ops::RollingShutterType::HORIZONTAL) + .export_values(); + + py::enum_(m, "CameraModel") + .value("PINHOLE", fvdb::detail::ops::CameraModel::PINHOLE) + .value("OPENCV_RADTAN_5", fvdb::detail::ops::CameraModel::OPENCV_RADTAN_5) + .value("OPENCV_RATIONAL_8", fvdb::detail::ops::CameraModel::OPENCV_RATIONAL_8) + .value("OPENCV_RADTAN_THIN_PRISM_9", + fvdb::detail::ops::CameraModel::OPENCV_RADTAN_THIN_PRISM_9) + .value("OPENCV_THIN_PRISM_12", fvdb::detail::ops::CameraModel::OPENCV_THIN_PRISM_12) + .value("ORTHOGRAPHIC", fvdb::detail::ops::CameraModel::ORTHOGRAPHIC) + .export_values(); + py::class_(m, "ProjectedGaussianSplats") .def_property_readonly("means2d", &fvdb::GaussianSplat3d::ProjectedGaussianSplats::means2d) .def_property_readonly("conics", &fvdb::GaussianSplat3d::ProjectedGaussianSplats::conics) @@ -201,6 +218,24 @@ bind_gaussian_splat3d(py::module &m) { py::arg("antialias") = false, py::arg("backgrounds") = std::nullopt) + .def("render_images_from_world", + &fvdb::GaussianSplat3d::renderImagesFromWorld, + py::arg("world_to_camera_matrices"), + py::arg("projection_matrices"), + py::arg("image_width"), + 
py::arg("image_height"), + py::arg("near"), + py::arg("far"), + py::arg("camera_model") = fvdb::detail::ops::CameraModel::PINHOLE, + py::arg("distortion_coeffs") = std::nullopt, + py::arg("sh_degree_to_use") = -1, + py::arg("tile_size") = 16, + py::arg("min_radius_2d") = 0.0, + py::arg("eps_2d") = 0.3, + py::arg("antialias") = false, + py::arg("backgrounds") = std::nullopt, + py::arg("masks") = std::nullopt) + .def("render_depths", &fvdb::GaussianSplat3d::renderDepths, py::arg("world_to_camera_matrices"), diff --git a/src/tests/GaussianRasterizeForwardTest.cpp b/src/tests/GaussianRasterizeForwardTest.cpp index c64d55cb..a234651f 100644 --- a/src/tests/GaussianRasterizeForwardTest.cpp +++ b/src/tests/GaussianRasterizeForwardTest.cpp @@ -8,11 +8,19 @@ #include #include +#include +#include #include #include #include +#if defined(__linux__) +#include +#include +#include +#endif + #include #include #include @@ -21,6 +29,10 @@ #error "FVDB_EXTERNAL_TEST_DATA_PATH must be defined" #endif +namespace { +constexpr const char *kMaskedEdgeTileChildEnv = "FVDB_GSPLAT_MASKED_EDGE_TILE_CHILD"; +} // namespace + struct GaussianRasterizeForwardTestFixture : public ::testing::Test { void loadInputData(const std::string insPath) { @@ -290,6 +302,127 @@ struct GaussianRasterizeForwardTestFixture : public ::testing::Test { uint32_t tileSize; }; +TEST(GaussianRasterizeForwardMaskedEdgeTile, Child) { +#if !defined(__linux__) + GTEST_SKIP() << "This regression test is Linux-only."; +#else + const char *isChild = std::getenv(kMaskedEdgeTileChildEnv); + if (!(isChild && std::string(isChild) == "1")) { + GTEST_SKIP() << "Not running child path."; + } + + if (c10::cuda::device_count() <= 0) { + GTEST_SKIP() << "CUDA not available."; + } + + const at::cuda::CUDAGuard device_guard(0); + + constexpr int64_t C = 1; + + constexpr uint32_t imageWidth = 17; + constexpr uint32_t imageHeight = 17; + constexpr uint32_t tileSize = 16; + constexpr uint32_t tileExtentH = (imageHeight + tileSize - 1) / 
tileSize; // 2 + constexpr uint32_t tileExtentW = (imageWidth + tileSize - 1) / tileSize; // 2 + + auto fopts = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kFloat32); + + // Single Gaussian that intersects the bottom-right edge tile. + const auto means2d = torch::tensor({{{16.5f, 16.5f}}}, fopts); // [C,N,2] + const auto conics = torch::tensor({{{1.0f, 0.0f, 1.0f}}}, fopts); // [C,N,3] + const auto features = torch::tensor({{{0.4f, 0.5f, -0.6f}}}, fopts); // [C,N,D] + const auto opacities = torch::tensor({{0.9f}}, fopts); // [C,N] + const auto radii = torch::tensor( + {{1}}, torch::TensorOptions().device(torch::kCUDA).dtype(torch::kInt32)); // [C,N] + const auto depths = torch::tensor({{1.0f}}, fopts); // [C,N] + const auto backgrounds = torch::tensor({{0.1f, -0.2f, 0.3f}}, fopts); // [C,D] + + auto masks = torch::ones({C, (int64_t)tileExtentH, (int64_t)tileExtentW}, + torch::TensorOptions().device(torch::kCUDA).dtype(torch::kBool)); + masks[0][1][1] = false; // mask out bottom-right edge tile + + auto [tileOffsets, tileGaussianIds] = + fvdb::detail::ops::dispatchGaussianTileIntersection( + means2d, radii, depths, at::nullopt, (uint32_t)C, tileSize, tileExtentH, tileExtentW); + + auto [outFeatures, outAlphas, outLastIds] = + fvdb::detail::ops::dispatchGaussianRasterizeForward(means2d, + conics, + features, + opacities, + imageWidth, + imageHeight, + 0, + 0, + tileSize, + tileOffsets, + tileGaussianIds, + backgrounds, + masks); + + (void)outLastIds; + + // Ensure the kernel completed (this would hang if there is a deadlock). + C10_CUDA_CHECK(cudaDeviceSynchronize()); + + // The only in-bounds pixel in the bottom-right edge tile is (16,16). It should be filled + // with background and alpha=0. 
+ const auto outFeaturesCpu = outFeatures.cpu(); + const auto outAlphasCpu = outAlphas.cpu(); + + EXPECT_EQ(outAlphasCpu[0][16][16][0].item(), 0.0f); + EXPECT_EQ(outFeaturesCpu[0][16][16][0].item(), 0.1f); + EXPECT_EQ(outFeaturesCpu[0][16][16][1].item(), -0.2f); + EXPECT_EQ(outFeaturesCpu[0][16][16][2].item(), 0.3f); +#endif +} + +TEST(GaussianRasterizeForwardMaskedEdgeTile, NoDeadlock) { +#if !defined(__linux__) + GTEST_SKIP() << "This regression test is Linux-only."; +#else + const char *isChild = std::getenv(kMaskedEdgeTileChildEnv); + if (isChild && std::string(isChild) == "1") { + GTEST_SKIP() << "Running child path."; + } + + pid_t pid = fork(); + ASSERT_GE(pid, 0) << "fork() failed"; + + if (pid == 0) { + // Child: exec the same test binary but run only the child test. + setenv(kMaskedEdgeTileChildEnv, "1", 1); + execl("/proc/self/exe", + "/proc/self/exe", + "--gtest_filter=GaussianRasterizeForwardMaskedEdgeTile.Child", + "--gtest_color=no", + (char *)nullptr); + _exit(127); + } + + int status = 0; + bool exited{false}; + constexpr int kTimeoutMs = 20000; + for (int elapsed = 0; elapsed < kTimeoutMs; elapsed += 50) { + const pid_t r = waitpid(pid, &status, WNOHANG); + if (r == pid) { + exited = true; + break; + } + usleep(50 * 1000); + } + + if (!exited) { + kill(pid, SIGKILL); + (void)waitpid(pid, &status, 0); + FAIL() << "Deadlock detected: child process timed out."; + } + + ASSERT_TRUE(WIFEXITED(status)) << "Child did not exit cleanly."; + ASSERT_EQ(WEXITSTATUS(status), 0) << "Child test failed."; +#endif +} + // This is a helper function to generate the output data for the test cases. // Only enable this test when you want to update the output data. 
TEST_F(GaussianRasterizeForwardTestFixture, DISABLED_GenerateOutputData) { diff --git a/tests/unit/test_rasterize_from_world.py b/tests/unit/test_rasterize_from_world.py new file mode 100644 index 00000000..70fb2c2e --- /dev/null +++ b/tests/unit/test_rasterize_from_world.py @@ -0,0 +1,752 @@ +#!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + + +def _build_gaussian_splat( + fvdb, + *, + means: torch.Tensor, + quats: torch.Tensor, + log_scales: torch.Tensor, + logit_opacities: torch.Tensor, + sh0: torch.Tensor, + shN: torch.Tensor, +): + return fvdb.GaussianSplat3d.from_tensors( + means=means, + quats=quats, + log_scales=log_scales, + logit_opacities=logit_opacities, + sh0=sh0, + shN=shN, + accumulate_mean_2d_gradients=False, + accumulate_max_2d_radii=False, + detach=False, + ) + + +def _default_camera( + device: torch.device, + dtype: torch.dtype, + *, + cx: float, + cy: float, +): + world_to_cam = torch.eye(4, device=device, dtype=dtype).unsqueeze(0) + K = torch.tensor( + [[[18.0, 0.0, cx], [0.0, 18.0, cy], [0.0, 0.0, 1.0]]], + device=device, + dtype=dtype, + ) + return world_to_cam, K + + +def _render_from_world( + fvdb, + gs, + *, + world_to_cam: torch.Tensor, + K: torch.Tensor, + image_width: int, + image_height: int, + sh_degree_to_use: int, + tile_size: int = 16, + backgrounds: torch.Tensor | None = None, + masks: torch.Tensor | None = None, +): + return gs.render_images_from_world( + world_to_camera_matrices=world_to_cam, + projection_matrices=K, + image_width=image_width, + image_height=image_height, + near=0.01, + far=1e10, + camera_model=fvdb.CameraModel.PINHOLE, + distortion_coeffs=None, + sh_degree_to_use=sh_degree_to_use, + tile_size=tile_size, + min_radius_2d=0.0, + eps_2d=0.3, + antialias=False, + backgrounds=backgrounds, + masks=masks, + ) + + +def _assert_nonzero_finite_grad(tensor: torch.Tensor) -> None: + assert tensor.grad is not None + assert 
torch.isfinite(tensor.grad).all() + assert tensor.grad.abs().sum().item() > 0.0 + + +def _render_images_from_world_masked_edge_tile_worker(queue) -> None: + """ + Subprocess worker for the masked edge-tile deadlock regression test. + + Must be top-level for multiprocessing "spawn" pickling. + """ + try: + import fvdb + + device = torch.device("cuda") + dtype = torch.float32 + + C, N, D = 1, 1, 3 + image_width = 17 + image_height = 17 + tile_size = 16 # -> tileH=tileW=2, with a 1-pixel edge tile at (1,1) + + # Place the Gaussian so it projects into the bottom-right edge tile. + means = torch.tensor([[1.11, 1.11, 2.5]], device=device, dtype=dtype, requires_grad=True) + quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True) + log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True) + logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True) + sh0 = torch.tensor([[[0.8, -0.2, 0.3]]], device=device, dtype=dtype, requires_grad=True) + shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True) + + gs = _build_gaussian_splat( + fvdb, + means=means, + quats=quats, + log_scales=log_scales, + logit_opacities=logit_opacities, + sh0=sh0, + shN=shN, + ) + + world_to_cam, K = _default_camera(device, dtype, cx=8.0, cy=8.0) + + backgrounds = torch.tensor([[0.10, -0.20, 0.30]], device=device, dtype=dtype) # [C,D] + masks = torch.ones((C, 2, 2), device=device, dtype=torch.bool) + masks[0, 1, 1] = False # mask out the bottom-right edge tile + + rendered, alphas = _render_from_world( + fvdb, + gs, + world_to_cam=world_to_cam, + K=K, + image_width=image_width, + image_height=image_height, + sh_degree_to_use=0, + tile_size=tile_size, + backgrounds=backgrounds, + masks=masks, + ) + + loss = rendered.sum() + alphas.sum() + loss.backward() + torch.cuda.synchronize() + + queue.put(("ok", None)) + except Exception: # pragma: no cover + import traceback + + 
        # Tail of _render_images_from_world_masked_edge_tile_worker (def starts above this
        # chunk): report the failure back to the parent process through the queue.
        queue.put(("err", traceback.format_exc()))


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_nonzero():
    """
    Smoke-check the dense rasterizer backward pass: rendering a single Gaussian and
    backpropagating a simple loss must produce finite, nonzero gradients for every
    differentiable parameter (means, quats, log_scales, logit_opacities, sh0).
    """
    # Import inside the test so CPU-only environments can still collect this file.
    import fvdb

    device = torch.device("cuda")
    C, N, D = 1, 1, 3

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=torch.float32, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=torch.float32,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=torch.float32, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=torch.float32, requires_grad=True)

    # Degree-0 SH only: sh0 is [N,1,D]; shN is empty ([N,0,D]).
    sh0 = torch.randn((N, 1, D), device=device, dtype=torch.float32, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=torch.float32, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    image_width = 16
    image_height = 16
    world_to_cam, K = _default_camera(device, torch.float32, cx=7.5, cy=7.5)

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
    )

    # Scalar objective touching both the rendered features and the alpha channel so all
    # parameters receive gradient signal.
    loss = (rendered * rendered).sum() + alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_match_finite_differences():
    """
    Finite-difference check for the 3DGS dense rasterizer backward pass.

    This is intentionally small (C=N=1, single tile) to keep runtime down and to avoid
    discontinuities from tile assignment changes.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    # No higher-order SH in this test.
    shN = torch.empty((N, 0, D), device=device, dtype=dtype)

    def loss_from_params(
        means_t: torch.Tensor,
        quats_t: torch.Tensor,
        log_scales_t: torch.Tensor,
        logit_opacities_t: torch.Tensor,
        sh0_t: torch.Tensor,
    ) -> torch.Tensor:
        # Rebuild the splat from the given parameters and return a smooth scalar loss;
        # used both for the autograd baseline and for finite-difference evaluations.
        gs = _build_gaussian_splat(
            fvdb,
            means=means_t,
            quats=quats_t,
            log_scales=log_scales_t,
            logit_opacities=logit_opacities_t,
            sh0=sh0_t,
            shN=shN,
        )

        rendered, alphas = _render_from_world(
            fvdb,
            gs,
            world_to_cam=world_to_cam,
            K=K,
            image_width=image_width,
            image_height=image_height,
            sh_degree_to_use=0,
            tile_size=16,  # single tile
        )

        # Smooth scalar objective; includes both color/features and opacity terms.
        return (rendered * rendered).sum() + 0.5 * alphas.sum()

    # Baseline parameters.
    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.tensor([[[0.8, -0.2, 0.3]]], device=device, dtype=dtype, requires_grad=True)  # [N,1,D]

    # Autograd gradients.
    loss = loss_from_params(means, quats, log_scales, logit_opacities, sh0)
    loss.backward()

    assert means.grad is not None
    assert quats.grad is not None
    assert log_scales.grad is not None
    assert logit_opacities.grad is not None
    assert sh0.grad is not None

    # Centered finite differences for a small subset of scalars.
    eps = 1.0e-3
    rtol = 2.5e-2
    atol = 2.5e-2

    def fd_scalar(param_name: str, i0: int, i1: int | None = None, i2: int | None = None) -> float:
        # Centered finite difference of the loss w.r.t. one scalar entry of one parameter
        # tensor, selected by `param_name` and indices (i0[, i1[, i2]]).
        # NOTE(review): reconstructed indentation places the loss evaluations inside the
        # torch.no_grad() block — only .item() values are used, so grad mode is irrelevant
        # to the result; confirm against the original layout.
        with torch.no_grad():
            means_p = means.detach().clone()
            means_m = means.detach().clone()
            quats_p = quats.detach().clone()
            quats_m = quats.detach().clone()
            log_scales_p = log_scales.detach().clone()
            log_scales_m = log_scales.detach().clone()
            logit_p = logit_opacities.detach().clone()
            logit_m = logit_opacities.detach().clone()
            sh0_p = sh0.detach().clone()
            sh0_m = sh0.detach().clone()

            if param_name == "means":
                assert i1 is not None
                means_p[i0, i1] += eps
                means_m[i0, i1] -= eps
            elif param_name == "quats":
                assert i1 is not None
                quats_p[i0, i1] += eps
                quats_m[i0, i1] -= eps
            elif param_name == "log_scales":
                assert i1 is not None
                log_scales_p[i0, i1] += eps
                log_scales_m[i0, i1] -= eps
            elif param_name == "logit_opacities":
                assert i1 is None
                logit_p[i0] += eps
                logit_m[i0] -= eps
            elif param_name == "sh0":
                assert i1 is not None and i2 is not None
                sh0_p[i0, i1, i2] += eps
                sh0_m[i0, i1, i2] -= eps
            else:
                raise AssertionError(f"Unknown param_name: {param_name}")

            lp = loss_from_params(means_p, quats_p, log_scales_p, logit_p, sh0_p).item()
            lm = loss_from_params(means_m, quats_m, log_scales_m, logit_m, sh0_m).item()
            return (lp - lm) / (2.0 * eps)

    # (name, autograd gradient, finite-difference gradient) triples to compare.
    checks: list[tuple[str, float, float]] = []
    checks.append(("means[0,0]", float(means.grad[0, 0].item()), fd_scalar("means", 0, 0)))
    checks.append(("means[0,2]", float(means.grad[0, 2].item()), fd_scalar("means", 0, 2)))
    checks.append(("log_scales[0,0]", float(log_scales.grad[0, 0].item()), fd_scalar("log_scales", 0, 0)))
    checks.append(("log_scales[0,2]", float(log_scales.grad[0, 2].item()), fd_scalar("log_scales", 0, 2)))
    checks.append(("logit_opacities[0]", float(logit_opacities.grad[0].item()), fd_scalar("logit_opacities", 0)))
    checks.append(("sh0[0,0,0]", float(sh0.grad[0, 0, 0].item()), fd_scalar("sh0", 0, 0, 0)))
    checks.append(("sh0[0,0,2]", float(sh0.grad[0, 0, 2].item()), fd_scalar("sh0", 0, 0, 2)))

    for name, grad_autograd, grad_fd in checks:
        assert torch.isfinite(torch.tensor(grad_autograd))
        assert torch.isfinite(torch.tensor(grad_fd))
        assert grad_autograd == pytest.approx(
            grad_fd, rel=rtol, abs=atol
        ), f"{name}: autograd={grad_autograd} fd={grad_fd}"

    # Quaternion tangent-space finite-difference check.
    # We compare the FD directional derivative along a small axis-angle perturbation to the
    # autograd gradient dotted with dq/dt (estimated numerically).
    def quat_mul_wxyz(q1: torch.Tensor, q2: torch.Tensor) -> torch.Tensor:
        # Hamilton product of two quaternions in (w, x, y, z) layout.
        w1, x1, y1, z1 = q1.unbind(-1)
        w2, x2, y2, z2 = q2.unbind(-1)
        return torch.stack(
            (
                w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
                w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
                w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
                w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
            ),
            dim=-1,
        )

    def quat_exp_axis_angle_wxyz(v: torch.Tensor) -> torch.Tensor:
        # Exponential map: axis-angle vector v -> unit quaternion (w, x, y, z).
        theta = torch.linalg.norm(v)
        half = 0.5 * theta
        if float(theta.item()) < 1.0e-12:
            # Identity rotation for (near-)zero perturbations.
            return torch.tensor([1.0, 0.0, 0.0, 0.0], device=v.device, dtype=v.dtype)
        axis = v / theta
        return torch.cat((torch.cos(half).view(1), torch.sin(half) * axis), dim=0)

    with torch.no_grad():
        q0 = quats.detach()[0]  # [4] wxyz
        # Two independent tangent directions.
        dirs = [
            torch.tensor([1.0, 0.0, 0.0], device=device, dtype=dtype),
            torch.tensor([0.0, 1.0, 0.2], device=device, dtype=dtype),
        ]
        for j, d in enumerate(dirs):
            d = d / torch.linalg.norm(d)
            dq_plus = quat_exp_axis_angle_wxyz(eps * d)
            dq_minus = quat_exp_axis_angle_wxyz(-eps * d)
            q_plus = quat_mul_wxyz(dq_plus, q0).view(1, 4)
            q_minus = quat_mul_wxyz(dq_minus, q0).view(1, 4)

            lp = loss_from_params(
                means.detach(), q_plus, log_scales.detach(), logit_opacities.detach(), sh0.detach()
            ).item()
            lm = loss_from_params(
                means.detach(), q_minus, log_scales.detach(), logit_opacities.detach(), sh0.detach()
            ).item()
            fd_dir = (lp - lm) / (2.0 * eps)

            # Numerical dq/dt along this tangent direction; dotting the autograd gradient
            # with it gives the autograd directional derivative to compare against fd_dir.
            dqdt = (q_plus - q_minus) / (2.0 * eps)  # [1,4]
            dir_autograd = float((quats.grad.detach() * dqdt).sum().item())

            assert torch.isfinite(torch.tensor(fd_dir))
            assert torch.isfinite(torch.tensor(dir_autograd))
            assert dir_autograd == pytest.approx(
                fd_dir, rel=rtol, abs=atol
            ), f"quats tangent dir {j}: autograd={dir_autograd} fd={fd_dir}"


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_grads_nonzero_with_shN():
    """
    Same nonzero-gradient smoke check as above, but rendering with SH degree 1 so that the
    higher-order coefficients (shN) also participate and must receive gradients.
    """
    import fvdb

    device = torch.device("cuda")
    C, N, D = 1, 1, 3

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=torch.float32, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=torch.float32,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=torch.float32, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=torch.float32, requires_grad=True)

    # Degree 1 uses 4 SH bases total; sh0 is [N,1,D] and shN holds the remaining K-1=3 bases.
    sh0 = torch.randn((N, 1, D), device=device, dtype=torch.float32, requires_grad=True)
    shN = torch.randn((N, 3, D), device=device, dtype=torch.float32, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    image_width = 16
    image_height = 16
    world_to_cam, K = _default_camera(device, torch.float32, cx=7.5, cy=7.5)

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=1,
    )

    loss = (rendered * rendered).sum() + alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)
    _assert_nonzero_finite_grad(shN)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_shN_grads_match_finite_differences():
    """
    Finite-difference check for SHN coefficients (non-constant SH terms).
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    N = 1
    D = 3
    image_width = 16
    image_height = 16

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    # All non-shN parameters are fixed (no requires_grad); only shN is differentiated here.
    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype)

    sh0 = torch.tensor([0.8, -0.2, 0.3], device=device, dtype=dtype).view(N, 1, D).contiguous()
    # Baseline shN values; perturbed copies are derived from this tensor below.
    shN0 = (
        torch.tensor(
            [[0.05, -0.01, 0.02], [0.00, 0.03, -0.04], [0.01, 0.02, 0.00]],
            device=device,
            dtype=dtype,
        )
        .view(N, 3, D)
        .contiguous()
    )

    def loss_from_shN(shN_t: torch.Tensor) -> torch.Tensor:
        # Smooth scalar loss as a function of the shN coefficients only.
        gs = _build_gaussian_splat(
            fvdb,
            means=means,
            quats=quats,
            log_scales=log_scales,
            logit_opacities=logit_opacities,
            sh0=sh0,
            shN=shN_t,
        )

        rendered, alphas = _render_from_world(
            fvdb,
            gs,
            world_to_cam=world_to_cam,
            K=K,
            image_width=image_width,
            image_height=image_height,
            sh_degree_to_use=1,
        )

        return (rendered * rendered).sum() + 0.5 * alphas.sum()

    shN = shN0.detach().clone().requires_grad_(True)
    loss = loss_from_shN(shN)
    loss.backward()
    assert shN.grad is not None

    eps = 1.0e-3
    rtol = 2.5e-2
    atol = 2.5e-2

    def fd_shN(i1: int, i2: int) -> float:
        # Centered finite difference of the loss w.r.t. shN[0, i1, i2].
        with torch.no_grad():
            shN_p = shN0.detach().clone()
            shN_m = shN0.detach().clone()
            shN_p[0, i1, i2] += eps
            shN_m[0, i1, i2] -= eps
            lp = loss_from_shN(shN_p).item()
            lm = loss_from_shN(shN_m).item()
            return (lp - lm) / (2.0 * eps)

    checks = [
        ("shN[0,0,0]", float(shN.grad[0, 0, 0].item()), fd_shN(0, 0)),
        ("shN[0,2,1]", float(shN.grad[0, 2, 1].item()), fd_shN(2, 1)),
    ]

    for name, grad_autograd, grad_fd in checks:
        assert torch.isfinite(torch.tensor(grad_autograd))
        assert torch.isfinite(torch.tensor(grad_fd))
        assert grad_autograd == pytest.approx(
            grad_fd, rel=rtol, abs=atol
        ), f"{name}: autograd={grad_autograd} fd={grad_fd}"


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_backward_with_backgrounds_and_masks():
    """
    Regression test: exercise autograd backward when *both* backgrounds and masks are provided.

    Historically, mismatches between the number of forward inputs and backward returns can cause
    runtime errors in some PyTorch builds. This test ensures the code path runs end-to-end.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16
    tile_size = 16  # single tile -> masks is [C,1,1]

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    # Unit quaternion (normalized).
    quats = torch.tensor(
        [[0.97321693, 0.05016582, 0.20066328, -0.10033164]],
        device=device,
        dtype=dtype,
        requires_grad=True,
    )
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.1, -0.2, 0.3]], device=device, dtype=dtype)  # [C,D]
    masks = torch.ones((C, 1, 1), device=device, dtype=torch.bool)  # enable the only tile

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        tile_size=tile_size,
        backgrounds=backgrounds,
        masks=masks,
    )

    loss = (rendered * rendered).sum() + 0.5 * alphas.sum()
    loss.backward()

    _assert_nonzero_finite_grad(means)
    _assert_nonzero_finite_grad(quats)
    _assert_nonzero_finite_grad(log_scales)
    _assert_nonzero_finite_grad(logit_opacities)
    _assert_nonzero_finite_grad(sh0)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_masks_write_background_and_zero_grads():
    """
    With the only tile masked out, the rendered image must be exactly the background,
    alphas must be exactly zero, and backward must produce exactly-zero gradients for
    every Gaussian parameter (masked pixels contribute nothing).
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16
    tile_size = 16  # single tile -> mask is [C,1,1]

    means = torch.tensor([[0.05, 0.05, 2.5]], device=device, dtype=dtype, requires_grad=True)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype, requires_grad=True)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype, requires_grad=True)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype, requires_grad=True)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype, requires_grad=True)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype, requires_grad=True)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.10, -0.20, 0.30]], device=device, dtype=dtype)  # [C,D]
    masks = torch.zeros((C, 1, 1), device=device, dtype=torch.bool)  # mask out the only tile

    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        tile_size=tile_size,
        backgrounds=backgrounds,
        masks=masks,
    )

    # Exact (bitwise) comparisons are intentional: masked tiles must be written with the
    # background verbatim, not a blended approximation.
    expected = backgrounds.view(C, 1, 1, D).expand(C, image_height, image_width, D)
    assert torch.equal(alphas, torch.zeros_like(alphas))
    assert torch.equal(rendered, expected)

    loss = rendered.sum() + alphas.sum()
    loss.backward()

    # Masked pixels contribute nothing: grads should be exactly zero.
    assert means.grad is not None and torch.equal(means.grad, torch.zeros_like(means.grad))
    assert quats.grad is not None and torch.equal(quats.grad, torch.zeros_like(quats.grad))
    assert log_scales.grad is not None and torch.equal(log_scales.grad, torch.zeros_like(log_scales.grad))
    assert logit_opacities.grad is not None and torch.equal(
        logit_opacities.grad, torch.zeros_like(logit_opacities.grad)
    )


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_masks_edge_tile_does_not_deadlock():
    """
    Regression test: masked *edge tiles* must not deadlock.

    Edge tiles (where image dimensions are not multiples of tile size) have threads that are
    `!inside`, but the kernel still uses block-level barriers. Any early-return taken by only
    the `inside` threads (e.g. for masked tiles) can deadlock the whole kernel launch.

    We run the render+backward in a subprocess with a timeout; a deadlock will cause the test
    to hang and fail deterministically.
    """

    import multiprocessing as mp

    # "spawn" gives the child a fresh CUDA context (fork is unsafe with CUDA).
    ctx = mp.get_context("spawn")
    q = ctx.Queue()

    p = ctx.Process(target=_render_images_from_world_masked_edge_tile_worker, args=(q,))
    p.start()
    p.join(timeout=20.0)

    if p.is_alive():
        # The worker hung: kill it and fail loudly rather than hanging the test session.
        p.terminate()
        p.join(timeout=5.0)
        pytest.fail("CUDA kernel deadlocked on masked edge tile (process timeout).")

    # The worker reports ("ok", ...) on success or ("err", traceback) on failure.
    status, payload = q.get(timeout=5.0)
    if status != "ok":
        raise AssertionError(payload)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gaussiansplat3d_render_images_from_world_backgrounds_used_when_no_intersections():
    """
    If no Gaussians intersect the image, rasterization should return the background (if provided)
    with alpha=0 everywhere.
    """
    import fvdb

    device = torch.device("cuda")
    dtype = torch.float32

    C, N, D = 1, 1, 3
    image_width = 16
    image_height = 16

    # Put the Gaussian in front of the camera but closer than near plane so it should be clipped.
    means = torch.tensor([[0.0, 0.0, 1.0e-4]], device=device, dtype=dtype)
    quats = torch.tensor([[1.0, 0.0, 0.0, 0.0]], device=device, dtype=dtype)
    log_scales = torch.tensor([[-0.6, -0.9, -0.4]], device=device, dtype=dtype)
    logit_opacities = torch.tensor([2.0], device=device, dtype=dtype)
    sh0 = torch.randn((N, 1, D), device=device, dtype=dtype)
    shN = torch.empty((N, 0, D), device=device, dtype=dtype)

    gs = _build_gaussian_splat(
        fvdb,
        means=means,
        quats=quats,
        log_scales=log_scales,
        logit_opacities=logit_opacities,
        sh0=sh0,
        shN=shN,
    )

    world_to_cam, K = _default_camera(device, dtype, cx=7.5, cy=7.5)

    backgrounds = torch.tensor([[0.25, 0.50, -0.75]], device=device, dtype=dtype)
    rendered, alphas = _render_from_world(
        fvdb,
        gs,
        world_to_cam=world_to_cam,
        K=K,
        image_width=image_width,
        image_height=image_height,
        sh_degree_to_use=0,
        backgrounds=backgrounds,
        masks=None,
    )

    # With zero intersections the output must be exactly the background and alpha must be 0.
    expected = backgrounds.view(C, 1, 1, D).expand(C, image_height, image_width, D)
    assert torch.equal(alphas, torch.zeros_like(alphas))
    assert torch.equal(rendered, expected)