From ec2027e980dc9e2cbf97e5367656f2eb9ce22c9f Mon Sep 17 00:00:00 2001
From: AndreaRigoni <andrea.rigoni@igi.cnr.it>
Date: Fri, 27 Mar 2026 16:42:04 +0000
Subject: [PATCH] check if algorithm could be run on cuda

---
 src/Core/Algorithm.h              | 45 ++++++++++++++++++++++++++-----
 src/Math/VoxImageFilter.h         | 11 ++++++++
 src/Math/VoxImageFilterABTrim.hpp | 12 ++++++---
 src/Math/VoxImageFilterLinear.hpp |  6 +++--
 4 files changed, 61 insertions(+), 13 deletions(-)
diff --git a/src/Core/Algorithm.h b/src/Core/Algorithm.h
index 720b117..e5375a4 100644
--- a/src/Core/Algorithm.h
+++ b/src/Core/Algorithm.h
@@ -30,10 +30,15 @@
 #include <chrono>
 #include <condition_variable>
 
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #include "Core/Object.h"
 #include "Core/Monitor.h"
 #include "Core/Threads.h"
 #include "Core/Property.h"
+#include "Core/DataAllocator.h"
 
 namespace uLib {
 
@@ -57,7 +62,12 @@ public:
     using EncoderType = T_enc;
     using DecoderType = T_dec;
 
-    Algorithm() : Object(), m_Encoder(nullptr), m_Decoder(nullptr) {}
+    /** @brief Starts with no encoder/decoder and MemoryDevice::RAM preferred. */
+    Algorithm()
+        : Object(),
+          m_Encoder(nullptr),
+          m_Decoder(nullptr),
+          m_PreferredDevice(MemoryDevice::RAM) {}
     virtual ~Algorithm() = default;
 
     virtual const char* GetClassName() const override { return "Algorithm"; }
@@ -79,6 +89,20 @@ public:
     void SetDecoder(Algorithm* dec) { m_Decoder = dec; }
     Algorithm* GetDecoder() const { return m_Decoder; }
 
+    /**
+     * @brief Returns the preferred memory device for this algorithm.
+     * The base version returns the value stored by SetPreferredDevice().
+     * CUDA-capable algorithms should override it to return VRAM when their
+     * data resides on the GPU; AlgorithmTask uses this to synchronize.
+     */
+    virtual MemoryDevice GetPreferredDevice() const { return m_PreferredDevice; }
+    void SetPreferredDevice(MemoryDevice dev) { m_PreferredDevice = dev; }
+
+    /**
+     * @brief True when GetPreferredDevice() (virtual, overrides apply) is VRAM.
+     */
+    bool IsGPU() const { return GetPreferredDevice() == MemoryDevice::VRAM; }
+
 signals:
     virtual void Started() { ULIB_SIGNAL_EMIT(Algorithm::Started); }
     virtual void Finished() { ULIB_SIGNAL_EMIT(Algorithm::Finished); }
@@ -86,6 +110,7 @@ signals:
 protected:
     Algorithm* m_Encoder;
     Algorithm* m_Decoder;
+    MemoryDevice m_PreferredDevice;
 };
 
 
@@ -192,11 +217,19 @@ protected:
     }
 
 private:
+    // One Process() pass; under USE_CUDA a GPU-preferring algorithm is then synced.
+    void ExecuteAlgorithm() {
+        if (m_Algorithm) {
+            m_Algorithm->Process(m_Input);
+#ifdef USE_CUDA
+            if (m_Algorithm->IsGPU()) cudaDeviceSynchronize();
+#endif
+        }
+    }
+
     void RunCyclic() {
         while (!m_StopRequested.load()) {
-            if (m_Algorithm) {
-                m_Algorithm->Process(m_Input);
-            }
+            ExecuteAlgorithm();
             std::unique_lock<std::timed_mutex> lock(m_WaitMutex.GetNative());
             m_Condition.wait_for(lock,
                 std::chrono::milliseconds(m_CycleTime_ms),
@@ -212,9 +245,7 @@ private:
             });
             if (m_StopRequested.load()) break;
             m_Triggered.store(false);
-            if (m_Algorithm) {
-                m_Algorithm->Process(m_Input);
-            }
+            ExecuteAlgorithm();
         }
     }
 
diff --git a/src/Math/VoxImageFilter.h b/src/Math/VoxImageFilter.h
index 9af5099..6e13c66 100644
--- a/src/Math/VoxImageFilter.h
+++ b/src/Math/VoxImageFilter.h
@@ -79,6 +79,17 @@ public:
    */
   void Run();
 
+  /** @brief VRAM if the image (when set) or the kernel data reports VRAM, else RAM. */
+  MemoryDevice GetPreferredDevice() const override {
+    // The image is consulted first; a null image defers to the kernel data.
+    const bool imageOnGpu =
+        m_Image && m_Image->Data().GetDevice() == MemoryDevice::VRAM;
+    const bool residesOnGpu =
+        imageOnGpu ||
+        m_KernelData.ConstData().GetDevice() == MemoryDevice::VRAM;
+    return residesOnGpu ? MemoryDevice::VRAM : MemoryDevice::RAM;
+  }
+
   void SetKernelNumericXZY(const std::vector<float> &numeric);
 
   void SetKernelSpherical(float (*shape)(float));
diff --git a/src/Math/VoxImageFilterABTrim.hpp b/src/Math/VoxImageFilterABTrim.hpp
index 621c00d..deb50e6 100644
--- a/src/Math/VoxImageFilterABTrim.hpp
+++ b/src/Math/VoxImageFilterABTrim.hpp
@@ -109,7 +109,8 @@ public:
   }
 
 #if defined(USE_CUDA) && defined(__CUDACC__)
-  void Run() {
+  VoxImage<VoxelT>* Process(VoxImage<VoxelT>* const& image) override {
+    if (this->m_Image != image) this->SetImage(image);
     if (this->m_Image->Data().GetDevice() == MemoryDevice::VRAM ||
         this->m_KernelData.Data().GetDevice() == MemoryDevice::VRAM) {
 
@@ -136,8 +137,9 @@ public:
           d_img_in, d_img_out, d_kernel, vox_size, ker_size, center_count,
           mAtrim, mBtrim);
       cudaDeviceSynchronize();
+      return this->m_Image;
     } else {
-      BaseClass::Run();
+      return BaseClass::Process(image);
     }
   }
 #endif
@@ -207,7 +209,8 @@ public:
   }
 
 #if defined(USE_CUDA) && defined(__CUDACC__)
-  void Run() {
+  VoxImage<VoxelT>* Process(VoxImage<VoxelT>* const& image) override {
+    if (this->m_Image != image) this->SetImage(image);
     if (this->m_Image->Data().GetDevice() == MemoryDevice::VRAM ||
         this->m_KernelData.Data().GetDevice() == MemoryDevice::VRAM) {
 
@@ -234,8 +237,9 @@ public:
           d_img_in, d_img_out, d_kernel, vox_size, ker_size, center_count,
           mAtrim, mBtrim);
       cudaDeviceSynchronize();
+      return this->m_Image;
     } else {
-      BaseClass::Run();
+      return BaseClass::Process(image);
     }
   }
 #endif
diff --git a/src/Math/VoxImageFilterLinear.hpp b/src/Math/VoxImageFilterLinear.hpp
index 420254a..c27bf92 100644
--- a/src/Math/VoxImageFilterLinear.hpp
+++ b/src/Math/VoxImageFilterLinear.hpp
@@ -67,7 +67,8 @@ public:
   VoxFilterAlgorithmLinear(const Vector3i &size) : BaseClass(size) {}
 
 #if defined(USE_CUDA) && defined(__CUDACC__)
-  void Run() {
+  VoxImage<VoxelT>* Process(VoxImage<VoxelT>* const& image) override {
+    if (this->m_Image != image) this->SetImage(image);
     if (this->m_Image->Data().GetDevice() == MemoryDevice::VRAM ||
         this->m_KernelData.Data().GetDevice() == MemoryDevice::VRAM) {
 
@@ -92,8 +93,9 @@ public:
       LinearFilterKernel<<<blocksPerGrid, threadsPerBlock>>>(
           d_img_in, d_img_out, d_kernel, vox_size, ker_size, center_count);
       cudaDeviceSynchronize();
+      return this->m_Image;
     } else {
-      BaseClass::Run();
+      return BaseClass::Process(image);
     }
   }
 #endif