feat: add CUDA raytracing benchmark and refactor VoxRaytracer::RayData to use DataAllocator for host/device memory management.

2026-03-04 17:47:18 +00:00
parent eb76521060
commit adedbcc37c
12 changed files with 1013 additions and 364 deletions
--- a/src/Math/testing/VoxRaytracerTestExtended.cpp
+++ b/src/Math/testing/VoxRaytracerTestExtended.cpp
@@ -0,0 +1,251 @@
+/*//////////////////////////////////////////////////////////////////////////////
+// CMT Cosmic Muon Tomography project //////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+  Copyright (c) 2014, Universita' degli Studi di Padova, INFN sez. di Padova
+  All rights reserved
+
+  Authors: Andrea Rigoni Garola < andrea.rigoni@pd.infn.it >
+
+  ------------------------------------------------------------------
+  This library is free software;  you  can  redistribute  it  and/or
+  modify it  under the  terms  of  the  GNU  Lesser  General  Public
+  License as published  by  the  Free  Software  Foundation;  either
+  version 3.0 of the License, or (at your option) any later version.
+
+  This library is  distributed in  the hope that it will  be useful,
+  but  WITHOUT ANY WARRANTY;  without  even  the implied warranty of
+  MERCHANTABILITY  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of  the GNU Lesser General  Public
+  License along with this library.
+
+//////////////////////////////////////////////////////////////////////////////*/
+
+#include "Math/StructuredGrid.h"
+#include "Math/VoxRaytracer.h"
+#include "testing-prototype.h"
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <vector>
+
+using namespace uLib;
+
+using Raytracer = VoxRaytracer;
+
+/**
+ * CPU vs CUDA benchmark and cross-check for VoxRaytracer.
+ *
+ * Builds a 100x100x100 grid, generates NUM_RAYS random segments whose end
+ * points lie on faces of two different axes, traces them one by one with
+ * TraceBetweenPoints() and prints the elapsed wall-clock time. When USE_CUDA
+ * is defined the same segments are traced with TraceBetweenPointsCUDA() and
+ * each ray is compared with its CPU counterpart: element count and total
+ * length first, then voxel id and length of every element (lengths must
+ * agree within 1e-3). The outcome is recorded through TEST1.
+ *
+ * @return 1 if the CUDA input buffers cannot be allocated or uploaded;
+ *         otherwise whatever the END_TESTING macro yields (its definition
+ *         is not visible in this file).
+ */
+int main() {
+  BEGIN_TESTING(Math VoxRaytracer Extended Benchmark);
+
+  std::cout << "\n=============================================\n";
+  std::cout << "  VoxRaytracer CPU vs CUDA Benchmark Test\n";
+  std::cout << "=============================================\n\n";
+
+  // Create a 100x100x100 grid (1 million voxels)
+  StructuredGrid img(Vector3i(100, 100, 100));
+  img.SetSpacing(Vector3f(1.0f, 1.0f, 1.0f));
+  img.SetPosition(Vector3f(-50.0f, -50.0f, -50.0f));
+
+  Raytracer rt(img);
+
+  const size_t NUM_RAYS = 1000000;
+  std::cout << "Generating " << NUM_RAYS
+            << " random ray pairs across a 100x100x100 grid...\n";
+
+  std::vector<HPoint3f> in_pts(NUM_RAYS);
+  std::vector<HPoint3f> out_pts(NUM_RAYS);
+
+  // Use a fixed seed for reproducible tests, so that a mismatch reported for
+  // ray i can be reproduced on a re-run (std::random_device would prevent it).
+  const std::mt19937::result_type RNG_SEED = 42u;
+  std::mt19937 gen(RNG_SEED);
+  // The grid spans from -50 to 50 on each axis
+  std::uniform_real_distribution<float> dist(-49.9f, 49.9f);
+  // Pick a random face for in/out to ensure rays cross the volume
+  std::uniform_int_distribution<int> face_dist(0, 5);
+
+  for (size_t i = 0; i < NUM_RAYS; ++i) {
+    HPoint3f p1, p2;
+    // Generate point 1 on a random face: faces 2k and 2k+1 are the -50 and
+    // +50 planes of axis k, the other two coordinates are drawn at random.
+    int f1 = face_dist(gen);
+    p1(0) = (f1 == 0) ? -50.0f : (f1 == 1) ? 50.0f : dist(gen);
+    p1(1) = (f1 == 2) ? -50.0f : (f1 == 3) ? 50.0f : dist(gen);
+    p1(2) = (f1 == 4) ? -50.0f : (f1 == 5) ? 50.0f : dist(gen);
+    p1(3) = 1.0f;
+
+    // Generate point 2 on a face of a different axis. f / 2 is the axis of
+    // face f, so this rejects both the same face and the opposite face.
+    int f2;
+    do {
+      f2 = face_dist(gen);
+    } while (f1 / 2 == f2 / 2);
+
+    p2(0) = (f2 == 0) ? -50.0f : (f2 == 1) ? 50.0f : dist(gen);
+    p2(1) = (f2 == 2) ? -50.0f : (f2 == 3) ? 50.0f : dist(gen);
+    p2(2) = (f2 == 4) ? -50.0f : (f2 == 5) ? 50.0f : dist(gen);
+    p2(3) = 1.0f;
+
+    in_pts[i] = p1;
+    out_pts[i] = p2;
+  }
+
+  std::vector<Raytracer::RayData> cpu_results(NUM_RAYS);
+
+  std::cout << "\nRunning CPU Raytracing...\n";
+  auto start_cpu = std::chrono::high_resolution_clock::now();
+
+  for (size_t i = 0; i < NUM_RAYS; ++i) {
+    cpu_results[i] = rt.TraceBetweenPoints(in_pts[i], out_pts[i]);
+  }
+
+  auto end_cpu = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::milli> cpu_ms = end_cpu - start_cpu;
+  std::cout << "CPU Execution Time: " << cpu_ms.count() << " ms\n";
+
+#ifdef USE_CUDA
+  std::vector<Raytracer::RayData> cuda_results(NUM_RAYS);
+  int max_elements_per_ray =
+      400; // 100x100x100 grid max trace length usually ~300 items
+
+  // Report a failed CUDA runtime call instead of silently carrying on with
+  // invalid device pointers. Yields true when the call succeeded.
+  auto cuda_ok = [](cudaError_t status, const char *what) -> bool {
+    if (status != cudaSuccess) {
+      std::cerr << "CUDA error in " << what << ": "
+                << cudaGetErrorString(status) << "\n";
+      return false;
+    }
+    return true;
+  };
+
+  std::cout << "\nPre-Allocating Data to VRAM...\n";
+  // Pre-allocate input and output points to VRAM
+  HPoint3f *d_in_pts = nullptr;
+  HPoint3f *d_out_pts = nullptr;
+  size_t pts_size = NUM_RAYS * sizeof(HPoint3f);
+  const bool inputs_on_device =
+      cuda_ok(cudaMalloc(&d_in_pts, pts_size), "cudaMalloc(d_in_pts)") &&
+      cuda_ok(cudaMalloc(&d_out_pts, pts_size), "cudaMalloc(d_out_pts)") &&
+      cuda_ok(cudaMemcpy(d_in_pts, in_pts.data(), pts_size,
+                         cudaMemcpyHostToDevice),
+              "cudaMemcpy(d_in_pts)") &&
+      cuda_ok(cudaMemcpy(d_out_pts, out_pts.data(), pts_size,
+                         cudaMemcpyHostToDevice),
+              "cudaMemcpy(d_out_pts)");
+  if (!inputs_on_device) {
+    // cudaFree(nullptr) is a no-op, so this is safe after a partial failure.
+    cudaFree(d_in_pts);
+    cudaFree(d_out_pts);
+    return 1;
+  }
+
+  // Pre-allocate elements output arrays in VRAM via DataAllocator
+  for (size_t i = 0; i < NUM_RAYS; ++i) {
+    cuda_results[i].Data().resize(max_elements_per_ray);
+    cuda_results[i].Data().MoveToVRAM();
+  }
+
+  std::cout << "Running CUDA Raytracing...\n";
+  auto start_cuda = std::chrono::high_resolution_clock::now();
+
+  float kernel_time_ms = 0.0f;
+  rt.TraceBetweenPointsCUDA(d_in_pts, d_out_pts, NUM_RAYS, cuda_results.data(),
+                            max_elements_per_ray, &kernel_time_ms);
+
+  auto end_cuda = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::milli> cuda_ms = end_cuda - start_cuda;
+
+  // Query device memory while the input buffers are still allocated so they
+  // are included; zero-initialised so a failed query prints 0, not garbage.
+  size_t free_byte = 0;
+  size_t total_byte = 0;
+  cuda_ok(cudaMemGetInfo(&free_byte, &total_byte), "cudaMemGetInfo");
+  double free_db = (double)free_byte / (1024.0 * 1024.0);
+  double total_db = (double)total_byte / (1024.0 * 1024.0);
+  double used_db = total_db - free_db;
+
+  // Free explicit input pointers
+  cudaFree(d_in_pts);
+  cudaFree(d_out_pts);
+
+  std::cout << "CUDA Kernel Exec Time: " << kernel_time_ms << " ms\n";
+  std::cout << "CUDA Total Time (API): " << cuda_ms.count() << " ms\n";
+  std::cout << "CUDA Total Time Spdup: " << (cpu_ms.count() / cuda_ms.count())
+            << "x\n";
+  if (kernel_time_ms > 0.0f) {
+    std::cout << "CUDA Kernel Speedup  : " << (cpu_ms.count() / kernel_time_ms)
+              << "x\n";
+  }
+  std::cout << "CUDA VRAM Usage Est. : " << used_db << " MB out of " << total_db
+            << " MB total\n";
+
+  std::cout << "\nVerifying CUDA results against CPU...\n";
+  size_t mismatches = 0;
+  for (size_t i = 0; i < NUM_RAYS; ++i) {
+    const auto &cpu_ray = cpu_results[i];
+    const auto &cuda_ray = cuda_results[i];
+
+    if (cpu_ray.Count() != cuda_ray.Count() ||
+        std::abs(cpu_ray.TotalLength() - cuda_ray.TotalLength()) > 1e-3) {
+      if (mismatches < 5) {
+        std::cout << "Mismatch at ray " << i
+                  << ": CPU count=" << cpu_ray.Count()
+                  << ", len=" << cpu_ray.TotalLength()
+                  << " vs CUDA count=" << cuda_ray.Count()
+                  << ", len=" << cuda_ray.TotalLength() << "\n";
+      }
+      mismatches++;
+      continue;
+    }
+
+    // Check elements
+    for (size_t j = 0; j < cpu_ray.Count(); ++j) {
+      if (cpu_ray.Data()[j].vox_id != cuda_ray.Data()[j].vox_id ||
+          std::abs(cpu_ray.Data()[j].L - cuda_ray.Data()[j].L) > 1e-3) {
+        if (mismatches < 5) {
+          std::cout << "Mismatch at ray " << i << ", element " << j
+                    << ": CPU id=" << cpu_ray.Data()[j].vox_id
+                    << ", L=" << cpu_ray.Data()[j].L
+                    << " vs CUDA id=" << cuda_ray.Data()[j].vox_id
+                    << ", L=" << cuda_ray.Data()[j].L << "\n";
+        }
+        mismatches++;
+        break;
+      }
+    }
+  }
+
+  if (mismatches == 0) {
+    std::cout << "SUCCESS! All " << NUM_RAYS
+              << " rays perfectly match between CPU and CUDA.\n";
+  } else {
+    std::cout << "FAILED! " << mismatches << " rays contain mismatched data.\n";
+  }
+
+  TEST1(mismatches == 0);
+
+#else
+  std::cout << "\nUSE_CUDA is not defined. Skipping CUDA benchmarking.\n";
+#endif
+
+  std::cout << "=============================================\n";
+  END_TESTING
+}