uLib/src/Math/testing/VoxRaytracerTestExtended.cpp

/*//////////////////////////////////////////////////////////////////////////////
// CMT Cosmic Muon Tomography project //////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

  Copyright (c) 2014, Universita' degli Studi di Padova, INFN sez. di Padova
  All rights reserved

  Authors: Andrea Rigoni Garola < andrea.rigoni@pd.infn.it >

  ------------------------------------------------------------------
  This library is free software;  you  can  redistribute  it  and/or
  modify it  under the  terms  of  the  GNU  Lesser  General  Public
  License as published  by  the  Free  Software  Foundation;  either
  version 3.0 of the License, or (at your option) any later version.

  This library is  distributed in  the hope that it will  be useful,
  but  WITHOUT ANY WARRANTY;  without  even  the implied warranty of
  MERCHANTABILITY  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of  the GNU Lesser General  Public
  License along with this library.

//////////////////////////////////////////////////////////////////////////////*/

#include "Math/StructuredGrid.h"
#include "Math/VoxRaytracer.h"
#include "testing-prototype.h"
#include <chrono>
#include <iostream>
#include <random>

using namespace uLib;

typedef VoxRaytracer Raytracer;

int main() {
  BEGIN_TESTING(Math VoxRaytracer Extended Benchmark);

  std::cout << "\n=============================================\n";
  std::cout << "  VoxRaytracer CPU vs CUDA Benchmark Test\n";
  std::cout << "=============================================\n\n";

  // Create a 100x100x100 grid (1 million voxels)
  StructuredGrid img(Vector3i(100, 100, 100));
  img.SetSpacing(Vector3f(1.0f, 1.0f, 1.0f));
  img.SetPosition(Vector3f(-50.0f, -50.0f, -50.0f));

  Raytracer rt(img);

  const size_t NUM_RAYS = 1000000;
  std::cout << "Generating " << NUM_RAYS
            << " random ray pairs across a 100x100x100 grid...\n";

  std::vector<HPoint3f> in_pts(NUM_RAYS);
  std::vector<HPoint3f> out_pts(NUM_RAYS);

  // Use a fixed seed for reproducible tests
  std::random_device rd;
  std::mt19937 gen(rd());
  // The grid spans from -50 to 50 on each axis
  std::uniform_real_distribution<float> dist(-49.9f, 49.9f);
  // Pick a random face for in/out to ensure rays cross the volume
  std::uniform_int_distribution<int> face_dist(0, 5);

  for (size_t i = 0; i < NUM_RAYS; ++i) {
    HPoint3f p1, p2;
    // Generate point 1 on a random face
    int f1 = face_dist(gen);
    p1(0) = (f1 == 0) ? -50.0f : (f1 == 1) ? 50.0f : dist(gen);
    p1(1) = (f1 == 2) ? -50.0f : (f1 == 3) ? 50.0f : dist(gen);
    p1(2) = (f1 == 4) ? -50.0f : (f1 == 5) ? 50.0f : dist(gen);
    p1(3) = 1.0f;

    // Generate point 2 on a different face
    int f2;
    do {
      f2 = face_dist(gen);
    } while (
        f1 == f2 ||
        f1 / 2 ==
            f2 / 2); // Avoid same face or opposite face trivially if desired

    p2(0) = (f2 == 0) ? -50.0f : (f2 == 1) ? 50.0f : dist(gen);
    p2(1) = (f2 == 2) ? -50.0f : (f2 == 3) ? 50.0f : dist(gen);
    p2(2) = (f2 == 4) ? -50.0f : (f2 == 5) ? 50.0f : dist(gen);
    p2(3) = 1.0f;

    in_pts[i] = p1;
    out_pts[i] = p2;
  }

  std::vector<Raytracer::RayData> cpu_results(NUM_RAYS);

  std::cout << "\nRunning CPU Raytracing...\n";
  auto start_cpu = std::chrono::high_resolution_clock::now();

  for (size_t i = 0; i < NUM_RAYS; ++i) {
    cpu_results[i] = rt.TraceBetweenPoints(in_pts[i], out_pts[i]);
  }

  auto end_cpu = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> cpu_ms = end_cpu - start_cpu;
  std::cout << "CPU Execution Time: " << cpu_ms.count() << " ms\n";

#ifdef USE_CUDA
  std::vector<Raytracer::RayData> cuda_results(NUM_RAYS);
  int max_elements_per_ray =
      400; // 100x100x100 grid max trace length usually ~300 items

  std::cout << "\nPre-Allocating Data to VRAM...\n";
  // Pre-allocate input and output points to VRAM
  HPoint3f *d_in_pts;
  HPoint3f *d_out_pts;
  size_t pts_size = NUM_RAYS * sizeof(HPoint3f);
  cudaMalloc(&d_in_pts, pts_size);
  cudaMalloc(&d_out_pts, pts_size);
  cudaMemcpy(d_in_pts, in_pts.data(), pts_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_out_pts, out_pts.data(), pts_size, cudaMemcpyHostToDevice);

  // Pre-allocate elements output arrays in VRAM via DataAllocator
  for (size_t i = 0; i < NUM_RAYS; ++i) {
    cuda_results[i].Data().resize(max_elements_per_ray);
    cuda_results[i].Data().MoveToVRAM();
  }

  std::cout << "Running CUDA Raytracing...\n";
  auto start_cuda = std::chrono::high_resolution_clock::now();

  float kernel_time_ms = 0.0f;
  rt.TraceBetweenPointsCUDA(d_in_pts, d_out_pts, NUM_RAYS, cuda_results.data(),
                            max_elements_per_ray, &kernel_time_ms);

  auto end_cuda = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> cuda_ms = end_cuda - start_cuda;

  // Free explicit input pointers
  cudaFree(d_in_pts);
  cudaFree(d_out_pts);

  // Also query memory usage info
  size_t free_byte;
  size_t total_byte;
  cudaMemGetInfo(&free_byte, &total_byte);
  double free_db = (double)free_byte / (1024.0 * 1024.0);
  double total_db = (double)total_byte / (1024.0 * 1024.0);
  double used_db = total_db - free_db;

  std::cout << "CUDA Kernel Exec Time: " << kernel_time_ms << " ms\n";
  std::cout << "CUDA Total Time (API): " << cuda_ms.count() << " ms\n";
  std::cout << "CUDA Total Time Spdup: " << (cpu_ms.count() / cuda_ms.count())
            << "x\n";
  if (kernel_time_ms > 0.0f) {
    std::cout << "CUDA Kernel Speedup  : " << (cpu_ms.count() / kernel_time_ms)
              << "x\n";
  }
  std::cout << "CUDA VRAM Usage Est. : " << used_db << " MB out of " << total_db
            << " MB total\n";

  std::cout << "\nVerifying CUDA results against CPU...\n";
  size_t mismatches = 0;
  for (size_t i = 0; i < NUM_RAYS; ++i) {
    const auto &cpu_ray = cpu_results[i];
    const auto &cuda_ray = cuda_results[i];

    if (cpu_ray.Count() != cuda_ray.Count() ||
        std::abs(cpu_ray.TotalLength() - cuda_ray.TotalLength()) > 1e-3) {
      if (mismatches < 5) {
        std::cout << "Mismatch at ray " << i
                  << ": CPU count=" << cpu_ray.Count()
                  << ", len=" << cpu_ray.TotalLength()
                  << " vs CUDA count=" << cuda_ray.Count()
                  << ", len=" << cuda_ray.TotalLength() << "\n";
      }
      mismatches++;
      continue;
    }

    // Check elements
    for (size_t j = 0; j < cpu_ray.Count(); ++j) {
      if (cpu_ray.Data()[j].vox_id != cuda_ray.Data()[j].vox_id ||
          std::abs(cpu_ray.Data()[j].L - cuda_ray.Data()[j].L) > 1e-3) {
        if (mismatches < 5) {
          std::cout << "Mismatch at ray " << i << ", element " << j
                    << ": CPU id=" << cpu_ray.Data()[j].vox_id
                    << ", L=" << cpu_ray.Data()[j].L
                    << " vs CUDA id=" << cuda_ray.Data()[j].vox_id
                    << ", L=" << cuda_ray.Data()[j].L << "\n";
        }
        mismatches++;
        break;
      }
    }
  }

  if (mismatches == 0) {
    std::cout << "SUCCESS! All " << NUM_RAYS
              << " rays perfectly match between CPU and CUDA.\n";
  } else {
    std::cout << "FAILED! " << mismatches << " rays contain mismatched data.\n";
  }

  TEST1(mismatches == 0);

#else
  std::cout << "\nUSE_CUDA is not defined. Skipping CUDA benchmarking.\n";
#endif

  std::cout << "=============================================\n";
  END_TESTING
}