feat: Implement CUDA support for VoxRaytracer, add CUDA tests for voxel image operations, and update CMake to enable CUDA compilation.

2026-03-04 13:59:45 +00:00
parent 52580d8cde
commit b1fb123026
13 changed files with 388 additions and 230 deletions
--- a/src/Math/VoxRaytracerCUDA.hpp
+++ b/src/Math/VoxRaytracerCUDA.hpp
@@ -0,0 +1,138 @@
+#ifndef VOXRAYTRACERCUDA_H
+#define VOXRAYTRACERCUDA_H
+
+#ifdef USE_CUDA
+
+#include "Math/VoxImage.h"
+#include "Math/VoxRaytracer.h"
+#include <cuda_runtime.h>
+
+namespace uLib {
+
+#ifdef __CUDACC__
+template <typename VoxelT>
+__global__ void
+RaytraceAccumulateKernel(const float *lines_data, int num_lines,
+                         VoxelT *d_image, int dim0, int dim1, int dim2,
+                         const float *inv_world_matrix_data, float scale0,
+                         float scale1, float scale2) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= num_lines)
+    return;
+
+  const float *line_ptr = &lines_data[idx * 8];
+
+  float o_vec[4] = {line_ptr[0], line_ptr[1], line_ptr[2], line_ptr[3]};
+  float d_vec[4] = {line_ptr[4], line_ptr[5], line_ptr[6], line_ptr[7]};
+
+  float pt[4] = {0, 0, 0, 0};
+  float s[4] = {0, 0, 0, 0};
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      float m_val = inv_world_matrix_data[i + j * 4];
+      pt[i] += m_val * o_vec[j];
+      s[i] += m_val * d_vec[j];
+    }
+  }
+
+  float l = sqrtf(s[0] * s[0] + s[1] * s[1] + s[2] * s[2]);
+  if (l == 0)
+    return;
+
+  float L[3];
+  L[0] = l / s[0];
+  L[1] = l / s[1];
+  L[2] = l / s[2];
+
+  float offset[3];
+  for (int i = 0; i < 3; ++i) {
+    float fpt_i = floorf(pt[i]);
+    offset[i] = (s[i] >= 0) ? (1.0f - (pt[i] - fpt_i)) : (pt[i] - fpt_i);
+    offset[i] = fabsf(offset[i] * L[i]);
+    L[i] = fabsf(L[i]);
+  }
+
+  int id;
+  float d;
+  int vid[3] = {(int)floorf(pt[0]), (int)floorf(pt[1]), (int)floorf(pt[2])};
+
+  float scale_arr[3] = {scale0, scale1, scale2};
+
+  while (vid[0] >= 0 && vid[0] < dim0 && vid[1] >= 0 && vid[1] < dim1 &&
+         vid[2] >= 0 && vid[2] < dim2) {
+
+    d = offset[0];
+    id = 0;
+    if (offset[1] < d) {
+      d = offset[1];
+      id = 1;
+    }
+    if (offset[2] < d) {
+      d = offset[2];
+      id = 2;
+    }
+
+    float L_intersect = d * scale_arr[id];
+    size_t vox_index = vid[0] * dim1 * dim2 + vid[1] * dim2 + vid[2];
+
+    atomicAdd(&(d_image[vox_index].Value), L_intersect);
+
+    float sign_s = (s[id] >= 0) ? 1.0f : -1.0f;
+    vid[id] += (int)sign_s;
+
+    offset[0] -= d;
+    offset[1] -= d;
+    offset[2] -= d;
+    offset[id] = L[id];
+  }
+}
+#endif
+
+template <typename VoxelT>
+void VoxRaytracer::AccumulateLinesCUDA(const HLine3f *lines, size_t num_lines,
+                                       VoxImage<VoxelT> &image) {
+  if (num_lines == 0)
+    return;
+
+  image.Data().MoveToVRAM();
+
+  float *d_lines = nullptr;
+  size_t lines_size = num_lines * sizeof(HLine3f);
+  cudaMalloc(&d_lines, lines_size);
+  cudaMemcpy(d_lines, lines, lines_size, cudaMemcpyHostToDevice);
+  int threadsPerBlock = 256;
+  int blocksPerGrid = (num_lines + threadsPerBlock - 1) / threadsPerBlock;
+
+  Vector3i dims = image.GetDims();
+  Matrix4f inv_world_matrix = image.GetWorldMatrix().inverse();
+
+  float *d_inv_world;
+  cudaMalloc(&d_inv_world, 16 * sizeof(float));
+  cudaMemcpy(d_inv_world, inv_world_matrix.data(), 16 * sizeof(float),
+             cudaMemcpyHostToDevice);
+
+#ifdef __CUDACC__
+  RaytraceAccumulateKernel<<<blocksPerGrid, threadsPerBlock>>>(
+      d_lines, num_lines, image.Data().GetVRAMData(), dims(0), dims(1), dims(2),
+      d_inv_world, m_scale(0), m_scale(1), m_scale(2));
+  cudaDeviceSynchronize();
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    std::cerr << "CUDA Error in AccumulateLinesCUDA: "
+              << cudaGetErrorString(err) << std::endl;
+  }
+#else
+  std::cerr << "RaytraceAccumulateKernel requires NVCC!" << std::endl;
+#endif
+
+  cudaFree(d_lines);
+  cudaFree(d_inv_world);
+}
+
+} // namespace uLib
+
+#endif // USE_CUDA
+
+#endif // VOXRAYTRACERCUDA_H