/*////////////////////////////////////////////////////////////////////////////// // CMT Cosmic Muon Tomography project ////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// Copyright (c) 2014, Universita' degli Studi di Padova, INFN sez. di Padova All rights reserved Authors: Andrea Rigoni Garola < andrea.rigoni@pd.infn.it > ------------------------------------------------------------------ This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3.0 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library. //////////////////////////////////////////////////////////////////////////////*/ #include "Math/StructuredGrid.h" #include "Math/VoxRaytracer.h" #include "testing-prototype.h" #include #include #include using namespace uLib; typedef VoxRaytracer Raytracer; int main() { BEGIN_TESTING(Math VoxRaytracer Extended Benchmark); std::cout << "\n=============================================\n"; std::cout << " VoxRaytracer CPU vs CUDA Benchmark Test\n"; std::cout << "=============================================\n\n"; // Create a 100x100x100 grid (1 million voxels) StructuredGrid img(Vector3i(100, 100, 100)); img.SetSpacing(Vector3f(1.0f, 1.0f, 1.0f)); img.SetPosition(Vector3f(-50.0f, -50.0f, -50.0f)); Raytracer rt(img); const size_t NUM_RAYS = 100000; std::cout << "Generating " << NUM_RAYS << " random ray pairs across a 100x100x100 grid...\n"; std::vector in_pts(NUM_RAYS); std::vector out_pts(NUM_RAYS); // Use a fixed seed for reproducible tests std::random_device rd; std::mt19937 gen(rd()); // The grid spans from -50 to 50 on each axis std::uniform_real_distribution dist(-49.9f, 49.9f); // Pick a random face for in/out to ensure rays cross the volume std::uniform_int_distribution face_dist(0, 5); for (size_t i = 0; i < NUM_RAYS; ++i) { HPoint3f p1, p2; // Generate point 1 on a random face int f1 = face_dist(gen); p1(0) = (f1 == 0) ? -50.0f : (f1 == 1) ? 50.0f : dist(gen); p1(1) = (f1 == 2) ? -50.0f : (f1 == 3) ? 50.0f : dist(gen); p1(2) = (f1 == 4) ? -50.0f : (f1 == 5) ? 50.0f : dist(gen); p1(3) = 1.0f; // Generate point 2 on a different face int f2; do { f2 = face_dist(gen); } while ( f1 == f2 || f1 / 2 == f2 / 2); // Avoid same face or opposite face trivially if desired p2(0) = (f2 == 0) ? -50.0f : (f2 == 1) ? 50.0f : dist(gen); p2(1) = (f2 == 2) ? -50.0f : (f2 == 3) ? 50.0f : dist(gen); p2(2) = (f2 == 4) ? -50.0f : (f2 == 5) ? 50.0f : dist(gen); p2(3) = 1.0f; in_pts[i] = p1; out_pts[i] = p2; } std::vector cpu_results(NUM_RAYS); std::cout << "\nRunning CPU Raytracing...\n"; auto start_cpu = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < NUM_RAYS; ++i) { cpu_results[i] = rt.TraceBetweenPoints(in_pts[i], out_pts[i]); } auto end_cpu = std::chrono::high_resolution_clock::now(); std::chrono::duration cpu_ms = end_cpu - start_cpu; std::cout << "CPU Execution Time: " << cpu_ms.count() << " ms\n"; #ifdef USE_CUDA std::vector cuda_results(NUM_RAYS); int max_elements_per_ray = 400; // 100x100x100 grid max trace length usually ~300 items std::cout << "\nPre-Allocating Data to VRAM...\n"; // Pre-allocate input and output points to VRAM HPoint3f *d_in_pts; HPoint3f *d_out_pts; size_t pts_size = NUM_RAYS * sizeof(HPoint3f); cudaMalloc(&d_in_pts, pts_size); cudaMalloc(&d_out_pts, pts_size); cudaMemcpy(d_in_pts, in_pts.data(), pts_size, cudaMemcpyHostToDevice); cudaMemcpy(d_out_pts, out_pts.data(), pts_size, cudaMemcpyHostToDevice); // Pre-allocate elements output arrays in VRAM via DataAllocator for (size_t i = 0; i < NUM_RAYS; ++i) { cuda_results[i].Data().resize(max_elements_per_ray); cuda_results[i].Data().MoveToVRAM(); } std::cout << "Running CUDA Raytracing...\n"; auto start_cuda = std::chrono::high_resolution_clock::now(); float kernel_time_ms = 0.0f; rt.TraceBetweenPointsCUDA(d_in_pts, d_out_pts, NUM_RAYS, cuda_results.data(), max_elements_per_ray, &kernel_time_ms); auto end_cuda = std::chrono::high_resolution_clock::now(); std::chrono::duration cuda_ms = end_cuda - start_cuda; // Free explicit input pointers cudaFree(d_in_pts); cudaFree(d_out_pts); // Also query memory usage info size_t free_byte; size_t total_byte; cudaMemGetInfo(&free_byte, &total_byte); double free_db = (double)free_byte / (1024.0 * 1024.0); double total_db = (double)total_byte / (1024.0 * 1024.0); double used_db = total_db - free_db; std::cout << "CUDA Kernel Exec Time: " << kernel_time_ms << " ms\n"; std::cout << "CUDA Total Time (API): " << cuda_ms.count() << " ms\n"; std::cout << "CUDA Total Time Spdup: " << (cpu_ms.count() / cuda_ms.count()) << "x\n"; if (kernel_time_ms > 0.0f) { std::cout << "CUDA Kernel Speedup : " << (cpu_ms.count() / kernel_time_ms) << "x\n"; } std::cout << "CUDA VRAM Usage Est. : " << used_db << " MB out of " << total_db << " MB total\n"; std::cout << "\nVerifying CUDA results against CPU...\n"; size_t mismatches = 0; for (size_t i = 0; i < NUM_RAYS; ++i) { const auto &cpu_ray = cpu_results[i]; const auto &cuda_ray = cuda_results[i]; if (cpu_ray.Count() != cuda_ray.Count() || std::abs(cpu_ray.TotalLength() - cuda_ray.TotalLength()) > 1e-3) { if (mismatches < 5) { std::cout << "Mismatch at ray " << i << ": CPU count=" << cpu_ray.Count() << ", len=" << cpu_ray.TotalLength() << " vs CUDA count=" << cuda_ray.Count() << ", len=" << cuda_ray.TotalLength() << "\n"; } mismatches++; continue; } // Check elements for (size_t j = 0; j < cpu_ray.Count(); ++j) { if (cpu_ray.Data()[j].vox_id != cuda_ray.Data()[j].vox_id || std::abs(cpu_ray.Data()[j].L - cuda_ray.Data()[j].L) > 1e-3) { if (mismatches < 5) { std::cout << "Mismatch at ray " << i << ", element " << j << ": CPU id=" << cpu_ray.Data()[j].vox_id << ", L=" << cpu_ray.Data()[j].L << " vs CUDA id=" << cuda_ray.Data()[j].vox_id << ", L=" << cuda_ray.Data()[j].L << "\n"; } mismatches++; break; } } } if (mismatches == 0) { std::cout << "SUCCESS! All " << NUM_RAYS << " rays perfectly match between CPU and CUDA.\n"; } else { std::cout << "FAILED! " << mismatches << " rays contain mismatched data.\n"; } TEST1(mismatches == 0); #else std::cout << "\nUSE_CUDA is not defined. Skipping CUDA benchmarking.\n"; #endif std::cout << "=============================================\n"; END_TESTING }