feat: add CUDA raytracing benchmark and refactor VoxRaytracer::RayData to use DataAllocator for host/device memory management.
This commit is contained in:
@@ -5,6 +5,7 @@ set(TESTS
|
||||
ContainerBoxTest
|
||||
VoxImageTest
|
||||
VoxRaytracerTest
|
||||
VoxRaytracerTestExtended
|
||||
StructuredDataTest
|
||||
VoxImageFilterTest
|
||||
PolicyTest
|
||||
@@ -24,6 +25,6 @@ set(LIBRARIES
|
||||
uLib_add_tests(Math)
|
||||
|
||||
if(USE_CUDA)
|
||||
set_source_files_properties(VoxImageTest.cpp VoxImageCopyTest.cpp VoxImageFilterTest.cpp VoxRaytracerTest.cpp PROPERTIES LANGUAGE CUDA)
|
||||
set_source_files_properties(VoxRaytracerTest.cpp PROPERTIES CXX_STANDARD 17 CUDA_STANDARD 17)
|
||||
set_source_files_properties(VoxImageTest.cpp VoxImageCopyTest.cpp VoxImageFilterTest.cpp VoxRaytracerTest.cpp VoxRaytracerTestExtended.cpp PROPERTIES LANGUAGE CUDA)
|
||||
set_source_files_properties(VoxRaytracerTest.cpp VoxRaytracerTestExtended.cpp PROPERTIES CXX_STANDARD 17 CUDA_STANDARD 17)
|
||||
endif()
|
||||
|
||||
@@ -94,7 +94,8 @@ int main() {
|
||||
|
||||
Raytracer::RayData rdata =
|
||||
ray.TraceBetweenPoints(HPoint3f(-3, -3, -3), HPoint3f(3, 3, 3));
|
||||
for (const Raytracer::RayData::Element &el : rdata.Data()) {
|
||||
for (size_t i = 0; i < rdata.Count(); ++i) {
|
||||
const Raytracer::RayData::Element &el = rdata.Data()[i];
|
||||
std::cout << " " << el.vox_id << " , " << el.L << "\n";
|
||||
}
|
||||
}
|
||||
@@ -105,7 +106,7 @@ int main() {
|
||||
Raytracer rt(img);
|
||||
|
||||
Raytracer::RayData ray = rt.TraceBetweenPoints(pt1, pt2);
|
||||
TEST1(ray.Data().size() == 2);
|
||||
TEST1(ray.Count() == 2);
|
||||
TEST1(ray.Data().at(0).vox_id == 6);
|
||||
TEST1(ray.Data().at(1).vox_id == 7);
|
||||
ray.PrintSelf(std::cout);
|
||||
@@ -117,7 +118,7 @@ int main() {
|
||||
Raytracer rt(img);
|
||||
|
||||
Raytracer::RayData ray = rt.TraceBetweenPoints(pt1, pt2);
|
||||
TEST1(ray.Data().size() == 2);
|
||||
TEST1(ray.Count() == 2);
|
||||
TEST1(ray.Data().at(0).vox_id == 6);
|
||||
TEST1(ray.Data().at(1).vox_id == 4);
|
||||
ray.PrintSelf(std::cout);
|
||||
@@ -129,7 +130,7 @@ int main() {
|
||||
Raytracer rt(img);
|
||||
|
||||
Raytracer::RayData ray = rt.TraceBetweenPoints(pt1, pt2);
|
||||
TEST1(ray.Data().size() == 4);
|
||||
TEST1(ray.Count() == 4);
|
||||
TEST1(ray.Data().at(0).vox_id == 6);
|
||||
TEST1(ray.Data().at(1).vox_id == 4);
|
||||
TEST1(ray.Data().at(2).vox_id == 5);
|
||||
@@ -141,6 +142,46 @@ int main() {
|
||||
{
|
||||
std::cout << "\n--- Testing CUDA Raytracer Accumulator ---\n";
|
||||
|
||||
Raytracer rt(img);
|
||||
|
||||
{
|
||||
HPoint3f pt1(1, -0.5, 1);
|
||||
HPoint3f pt2(1, 4.5, 1);
|
||||
HPoint3f pts1[1] = {pt1};
|
||||
HPoint3f pts2[1] = {pt2};
|
||||
Raytracer::RayData ray_cuda[1];
|
||||
rt.TraceBetweenPointsCUDA(pts1, pts2, 1, ray_cuda);
|
||||
TEST1(ray_cuda[0].Count() == 2);
|
||||
TEST1(ray_cuda[0].Data().at(0).vox_id == 6);
|
||||
TEST1(ray_cuda[0].Data().at(1).vox_id == 7);
|
||||
}
|
||||
|
||||
{
|
||||
HPoint3f pt1(5, 1, 1);
|
||||
HPoint3f pt2(-3, 1, 1);
|
||||
HPoint3f pts1[1] = {pt1};
|
||||
HPoint3f pts2[1] = {pt2};
|
||||
Raytracer::RayData ray_cuda[1];
|
||||
rt.TraceBetweenPointsCUDA(pts1, pts2, 1, ray_cuda);
|
||||
TEST1(ray_cuda[0].Count() == 2);
|
||||
TEST1(ray_cuda[0].Data().at(0).vox_id == 6);
|
||||
TEST1(ray_cuda[0].Data().at(1).vox_id == 4);
|
||||
}
|
||||
|
||||
{
|
||||
HPoint3f pt1(1, 1, 1);
|
||||
HPoint3f pt2(-1, 3, -1);
|
||||
HPoint3f pts1[1] = {pt1};
|
||||
HPoint3f pts2[1] = {pt2};
|
||||
Raytracer::RayData ray_cuda[1];
|
||||
rt.TraceBetweenPointsCUDA(pts1, pts2, 1, ray_cuda);
|
||||
TEST1(ray_cuda[0].Count() == 4);
|
||||
TEST1(ray_cuda[0].Data().at(0).vox_id == 6);
|
||||
TEST1(ray_cuda[0].Data().at(1).vox_id == 4);
|
||||
TEST1(ray_cuda[0].Data().at(2).vox_id == 5);
|
||||
TEST1(ray_cuda[0].Data().at(3).vox_id == 1);
|
||||
}
|
||||
|
||||
VoxImage<TestVoxel> img_cuda(Vector3i(4, 4, 4));
|
||||
img_cuda.SetSpacing(Vector3f(2, 2, 2));
|
||||
img_cuda.SetPosition(Vector3f(-4, -4, -4));
|
||||
|
||||
211
src/Math/testing/VoxRaytracerTestExtended.cpp
Normal file
211
src/Math/testing/VoxRaytracerTestExtended.cpp
Normal file
@@ -0,0 +1,211 @@
|
||||
/*//////////////////////////////////////////////////////////////////////////////
|
||||
// CMT Cosmic Muon Tomography project //////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Copyright (c) 2014, Universita' degli Studi di Padova, INFN sez. di Padova
|
||||
All rights reserved
|
||||
|
||||
Authors: Andrea Rigoni Garola < andrea.rigoni@pd.infn.it >
|
||||
|
||||
------------------------------------------------------------------
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 3.0 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library.
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////*/
|
||||
|
||||
#include "Math/StructuredGrid.h"
|
||||
#include "Math/VoxRaytracer.h"
|
||||
#include "testing-prototype.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
|
||||
using namespace uLib;
|
||||
|
||||
typedef VoxRaytracer Raytracer;
|
||||
|
||||
int main() {
|
||||
BEGIN_TESTING(Math VoxRaytracer Extended Benchmark);
|
||||
|
||||
std::cout << "\n=============================================\n";
|
||||
std::cout << " VoxRaytracer CPU vs CUDA Benchmark Test\n";
|
||||
std::cout << "=============================================\n\n";
|
||||
|
||||
// Create a 100x100x100 grid (1 million voxels)
|
||||
StructuredGrid img(Vector3i(100, 100, 100));
|
||||
img.SetSpacing(Vector3f(1.0f, 1.0f, 1.0f));
|
||||
img.SetPosition(Vector3f(-50.0f, -50.0f, -50.0f));
|
||||
|
||||
Raytracer rt(img);
|
||||
|
||||
const size_t NUM_RAYS = 1000000;
|
||||
std::cout << "Generating " << NUM_RAYS
|
||||
<< " random ray pairs across a 100x100x100 grid...\n";
|
||||
|
||||
std::vector<HPoint3f> in_pts(NUM_RAYS);
|
||||
std::vector<HPoint3f> out_pts(NUM_RAYS);
|
||||
|
||||
// Use a fixed seed for reproducible tests
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
// The grid spans from -50 to 50 on each axis
|
||||
std::uniform_real_distribution<float> dist(-49.9f, 49.9f);
|
||||
// Pick a random face for in/out to ensure rays cross the volume
|
||||
std::uniform_int_distribution<int> face_dist(0, 5);
|
||||
|
||||
for (size_t i = 0; i < NUM_RAYS; ++i) {
|
||||
HPoint3f p1, p2;
|
||||
// Generate point 1 on a random face
|
||||
int f1 = face_dist(gen);
|
||||
p1(0) = (f1 == 0) ? -50.0f : (f1 == 1) ? 50.0f : dist(gen);
|
||||
p1(1) = (f1 == 2) ? -50.0f : (f1 == 3) ? 50.0f : dist(gen);
|
||||
p1(2) = (f1 == 4) ? -50.0f : (f1 == 5) ? 50.0f : dist(gen);
|
||||
p1(3) = 1.0f;
|
||||
|
||||
// Generate point 2 on a different face
|
||||
int f2;
|
||||
do {
|
||||
f2 = face_dist(gen);
|
||||
} while (
|
||||
f1 == f2 ||
|
||||
f1 / 2 ==
|
||||
f2 / 2); // Avoid same face or opposite face trivially if desired
|
||||
|
||||
p2(0) = (f2 == 0) ? -50.0f : (f2 == 1) ? 50.0f : dist(gen);
|
||||
p2(1) = (f2 == 2) ? -50.0f : (f2 == 3) ? 50.0f : dist(gen);
|
||||
p2(2) = (f2 == 4) ? -50.0f : (f2 == 5) ? 50.0f : dist(gen);
|
||||
p2(3) = 1.0f;
|
||||
|
||||
in_pts[i] = p1;
|
||||
out_pts[i] = p2;
|
||||
}
|
||||
|
||||
std::vector<Raytracer::RayData> cpu_results(NUM_RAYS);
|
||||
|
||||
std::cout << "\nRunning CPU Raytracing...\n";
|
||||
auto start_cpu = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (size_t i = 0; i < NUM_RAYS; ++i) {
|
||||
cpu_results[i] = rt.TraceBetweenPoints(in_pts[i], out_pts[i]);
|
||||
}
|
||||
|
||||
auto end_cpu = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> cpu_ms = end_cpu - start_cpu;
|
||||
std::cout << "CPU Execution Time: " << cpu_ms.count() << " ms\n";
|
||||
|
||||
#ifdef USE_CUDA
|
||||
std::vector<Raytracer::RayData> cuda_results(NUM_RAYS);
|
||||
int max_elements_per_ray =
|
||||
400; // 100x100x100 grid max trace length usually ~300 items
|
||||
|
||||
std::cout << "\nPre-Allocating Data to VRAM...\n";
|
||||
// Pre-allocate input and output points to VRAM
|
||||
HPoint3f *d_in_pts;
|
||||
HPoint3f *d_out_pts;
|
||||
size_t pts_size = NUM_RAYS * sizeof(HPoint3f);
|
||||
cudaMalloc(&d_in_pts, pts_size);
|
||||
cudaMalloc(&d_out_pts, pts_size);
|
||||
cudaMemcpy(d_in_pts, in_pts.data(), pts_size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_out_pts, out_pts.data(), pts_size, cudaMemcpyHostToDevice);
|
||||
|
||||
// Pre-allocate elements output arrays in VRAM via DataAllocator
|
||||
for (size_t i = 0; i < NUM_RAYS; ++i) {
|
||||
cuda_results[i].Data().resize(max_elements_per_ray);
|
||||
cuda_results[i].Data().MoveToVRAM();
|
||||
}
|
||||
|
||||
std::cout << "Running CUDA Raytracing...\n";
|
||||
auto start_cuda = std::chrono::high_resolution_clock::now();
|
||||
|
||||
float kernel_time_ms = 0.0f;
|
||||
rt.TraceBetweenPointsCUDA(d_in_pts, d_out_pts, NUM_RAYS, cuda_results.data(),
|
||||
max_elements_per_ray, &kernel_time_ms);
|
||||
|
||||
auto end_cuda = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> cuda_ms = end_cuda - start_cuda;
|
||||
|
||||
// Free explicit input pointers
|
||||
cudaFree(d_in_pts);
|
||||
cudaFree(d_out_pts);
|
||||
|
||||
// Also query memory usage info
|
||||
size_t free_byte;
|
||||
size_t total_byte;
|
||||
cudaMemGetInfo(&free_byte, &total_byte);
|
||||
double free_db = (double)free_byte / (1024.0 * 1024.0);
|
||||
double total_db = (double)total_byte / (1024.0 * 1024.0);
|
||||
double used_db = total_db - free_db;
|
||||
|
||||
std::cout << "CUDA Kernel Exec Time: " << kernel_time_ms << " ms\n";
|
||||
std::cout << "CUDA Total Time (API): " << cuda_ms.count() << " ms\n";
|
||||
std::cout << "CUDA Total Time Spdup: " << (cpu_ms.count() / cuda_ms.count())
|
||||
<< "x\n";
|
||||
if (kernel_time_ms > 0.0f) {
|
||||
std::cout << "CUDA Kernel Speedup : " << (cpu_ms.count() / kernel_time_ms)
|
||||
<< "x\n";
|
||||
}
|
||||
std::cout << "CUDA VRAM Usage Est. : " << used_db << " MB out of " << total_db
|
||||
<< " MB total\n";
|
||||
|
||||
std::cout << "\nVerifying CUDA results against CPU...\n";
|
||||
size_t mismatches = 0;
|
||||
for (size_t i = 0; i < NUM_RAYS; ++i) {
|
||||
const auto &cpu_ray = cpu_results[i];
|
||||
const auto &cuda_ray = cuda_results[i];
|
||||
|
||||
if (cpu_ray.Count() != cuda_ray.Count() ||
|
||||
std::abs(cpu_ray.TotalLength() - cuda_ray.TotalLength()) > 1e-3) {
|
||||
if (mismatches < 5) {
|
||||
std::cout << "Mismatch at ray " << i
|
||||
<< ": CPU count=" << cpu_ray.Count()
|
||||
<< ", len=" << cpu_ray.TotalLength()
|
||||
<< " vs CUDA count=" << cuda_ray.Count()
|
||||
<< ", len=" << cuda_ray.TotalLength() << "\n";
|
||||
}
|
||||
mismatches++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check elements
|
||||
for (size_t j = 0; j < cpu_ray.Count(); ++j) {
|
||||
if (cpu_ray.Data()[j].vox_id != cuda_ray.Data()[j].vox_id ||
|
||||
std::abs(cpu_ray.Data()[j].L - cuda_ray.Data()[j].L) > 1e-3) {
|
||||
if (mismatches < 5) {
|
||||
std::cout << "Mismatch at ray " << i << ", element " << j
|
||||
<< ": CPU id=" << cpu_ray.Data()[j].vox_id
|
||||
<< ", L=" << cpu_ray.Data()[j].L
|
||||
<< " vs CUDA id=" << cuda_ray.Data()[j].vox_id
|
||||
<< ", L=" << cuda_ray.Data()[j].L << "\n";
|
||||
}
|
||||
mismatches++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (mismatches == 0) {
|
||||
std::cout << "SUCCESS! All " << NUM_RAYS
|
||||
<< " rays perfectly match between CPU and CUDA.\n";
|
||||
} else {
|
||||
std::cout << "FAILED! " << mismatches << " rays contain mismatched data.\n";
|
||||
}
|
||||
|
||||
TEST1(mismatches == 0);
|
||||
|
||||
#else
|
||||
std::cout << "\nUSE_CUDA is not defined. Skipping CUDA benchmarking.\n";
|
||||
#endif
|
||||
|
||||
std::cout << "=============================================\n";
|
||||
END_TESTING
|
||||
}
|
||||
Reference in New Issue
Block a user