diff --git a/Dockerfile b/Dockerfile
index 27fefbe..97981e8 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 # Platform arm64 means nvidia jetson arm64v8.
 # Image may be not compatible with other arm machines.
 
-FROM --platform=linux/arm64v8 nvcr.io/nvidia/l4t-base:r35.1.0 AS handy-base-arm64
+FROM --platform=linux/arm64v8 nvcr.io/nvidia/l4t-base:r36.2.0 AS handy-base-arm64
 
 ### INSTALL NVIDIA PACKAGES
 
@@ -105,7 +105,7 @@ RUN apt-get update && \
         zlib1g-dev \
     && rm -rf /var/lib/apt/lists/* && apt-get clean
 
-ENV TORCHVISION_VERSION=0.14.0
+ENV TORCHVISION_VERSION=0.17.0
 
 FROM handy-common AS handy-cuda-arm64
 
@@ -162,19 +162,26 @@ RUN pip3 install --no-cache-dir \
     numpy \
     pillow
 
-ENV PYTORCH_WHL="torch-1.13.0a0+340c4120.nv22.06-cp38-cp38-linux_aarch64.whl"
-ENV PYTORCH_URL="https://developer.download.nvidia.com/compute/redist/jp/v50/pytorch/${PYTORCH_WHL}"
+ENV PYTORCH_WHL="torch-2.2.0a0+81ea7a4.nv24.01-cp310-cp310-linux_aarch64.whl"
+ENV PYTORCH_URL="https://developer.download.nvidia.com/compute/redist/jp/v60dp/pytorch/${PYTORCH_WHL}"
 
 RUN wget --no-check-certificate -qO ${PYTORCH_WHL} ${PYTORCH_URL} \
     && pip3 install --no-cache-dir ${PYTORCH_WHL} \
     && rm -rf /tmp/*
 
-RUN wget -qO - https://github.com/pytorch/vision/archive/refs/tags/v${TORCHVISION_VERSION}.tar.gz | tar -xz \
-    && cd vision-${TORCHVISION_VERSION} \
-    && python3 setup.py install \
-    && rm -rf /tmp/*
+RUN pip3 install torchvision==${TORCHVISION_VERSION}
+
+# RUN apt-get update && \
+#     apt-get install -y cuda-command-line-tools-12-2 libcupti-dev && \
+#     echo "/usr/local/cuda-12.2/extras/CUPTI/lib64" \
+#       > /etc/ld.so.conf.d/cupti.conf && ldconfig
 
-ENV PYTORCH_PATH="/usr/local/lib/python3.8/dist-packages/torch"
+# RUN wget -qO - https://github.com/pytorch/vision/archive/refs/tags/v${TORCHVISION_VERSION}.tar.gz | tar -xz \
+#     && cd vision-${TORCHVISION_VERSION} \
+#     && python3 setup.py install \
+#     && rm -rf /tmp/*
+
+ENV PYTORCH_PATH="/usr/local/lib/python3.10/dist-packages/torch"
 ENV LD_LIBRARY_PATH="${PYTORCH_PATH}/lib:${LD_LIBRARY_PATH}"
 
 FROM handy-common AS handy-cuda-amd64
@@ -215,13 +222,13 @@ RUN wget -qO - https://github.com/opencv/opencv/archive/refs/tags/${OPENCV_VERSI
         -DBUILD_TESTS=OFF \
     && make -j$(nproc) install && rm -rf /tmp/*
 
-ENV TORCH_VERSION=1.13.0
+ENV TORCH_VERSION=2.2.0
 
 RUN pip3 install --no-cache-dir \
     torch==${TORCH_VERSION} \
     torchvision==${TORCHVISION_VERSION}
 
-ENV PYTORCH_PATH="/usr/local/lib/python3.8/dist-packages/torch"
+ENV PYTORCH_PATH="/usr/local/lib/python3.10/dist-packages/torch"
 ENV LD_LIBRARY_PATH="${PYTORCH_PATH}/lib:${LD_LIBRARY_PATH}"
 
 FROM handy-cuda-${TARGETARCH} AS handy-ros
@@ -295,6 +302,11 @@ RUN mkdir -p ${ROS_ROOT} \
     > ${ROS_ROOT}/ros2.rosinstall \
     && vcs import ${ROS_TMP} < ${ROS_ROOT}/ros2.rosinstall
 
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+       ca-certificates curl gnupg2 lsb-release \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get update -q \
     && rosdep init \
     && rosdep update \
@@ -311,11 +323,13 @@ RUN apt-get update -q \
         --skip-keys python3-opencv \
     && rm -rf /var/lib/apt/lists/* && apt-get clean
 
+ENV CMAKE_TLS_VERIFY=0
+
 RUN cd ${ROS_TMP} \
     && colcon build \
         --merge-install \
         --install-base ${ROS_ROOT} \
-        --cmake-args -DBUILD_TESTING=OFF \
+        --cmake-args -DBUILD_TESTING=OFF -DCMAKE_TLS_VERIFY=OFF \
     && rm -rf /tmp/*
 
 RUN printf "export ROS_ROOT=${ROS_ROOT}\n" >> ${HOME}/.bashrc \
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 43754b4..f55943d 100755
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,14 +1,16 @@
 services:
   handy:
-    image: "registry.robotics-lab.ru/handy:0.7.3"
+    image: "registry.robotics-lab.ru/handy:0.7.4"
     container_name: "${CONTAINER_NAME:-handy-${USER}}"
     networks: [default]
     privileged: true
+    runtime: nvidia
     ports:
       - "${FOXGLOVE_PORT:-8765}:8765"
     volumes:
       - "/home/bakind/handy:/handy"
       - "/dev:/dev"
+      # - "/tmp/.X11-unix/:/tmp/.X11-unix"
     deploy:
       resources:
         limits:
@@ -22,10 +24,10 @@ services:
       x-bake:
         platforms:
           - linux/arm64
-          - linux/amd64
+          # - linux/amd64
         cache-from:
           - type=registry,ref=registry.robotics-lab.ru/handy:cache-arm64
-          - type=registry,ref=registry.robotics-lab.ru/handy:cache-amd64
+          # - type=registry,ref=registry.robotics-lab.ru/handy:cache-amd64
         cache-to:
           # [!!!] Warning: https://github.com/docker/buildx/discussions/1382
           # - type=registry,mode=max,ref=registry.robotics-lab.ru/handy:cache-arm64
diff --git a/packages/camera/CMakeLists.txt b/packages/camera/CMakeLists.txt
index 722645a..1a9ab81 100644
--- a/packages/camera/CMakeLists.txt
+++ b/packages/camera/CMakeLists.txt
@@ -4,6 +4,21 @@ project(camera)
 add_compile_options(-Wall -Wextra -Werror=unused-variable -Wpedantic)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3")
 
+set(Torch_DIR "/usr/local/lib/python3.10/dist-packages/torch/share/cmake/Torch")
+find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+
+add_compile_options(-fsanitize=address)
+add_link_options(-fsanitize=address)
+
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+  PATHS /usr/include /usr/local/include)
+find_library(TENSORRT_LIB nvinfer
+  PATHS /usr/lib /usr/lib/aarch64-linux-gnu)
+find_library(TENSORRT_ONNX_LIB nvonnxparser
+  PATHS /usr/lib /usr/lib/aarch64-linux-gnu)
+
+find_package(CUDAToolkit REQUIRED)
 find_package(OpenCV REQUIRED)
 find_package(yaml_cpp_vendor REQUIRED)
 find_package(ament_cmake REQUIRED)
@@ -18,28 +33,46 @@ add_executable(
   calibration src/calibration_main.cpp src/calibration.cpp src/params.cpp
 )
 add_executable(triangulation src/triangulation.cpp src/params.cpp)
+add_executable(inference_node src/inference_node.cpp src/inference_node_main.cpp)
 
 target_include_directories(
   camera PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-                "$<INSTALL_INTERFACE:include/${PROJECT_NAME}>" "/usr/include"
+                "$<INSTALL_INTERFACE:include/${PROJECT_NAME}>"
+                "/usr/include"
+                "/opt/ros/iron/include/mcap_vendor/mcap"
 )
 
 target_include_directories(
   calibration PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
                      "/usr/include"
 )
+
 target_include_directories(
   triangulation PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
                        "/usr/include"
 )
 
+target_include_directories(
+  inference_node PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+                        ${TENSORRT_INCLUDE_DIR}
+                        "/usr/local/cuda/include"
+                       "/usr/include"
+                       ${mcap_vendor_INCLUDE_DIRS}
+)
+
 ament_target_dependencies(camera yaml_cpp_vendor mcap_vendor)
 ament_target_dependencies(calibration yaml_cpp_vendor Boost OpenCV)
 ament_target_dependencies(triangulation OpenCV yaml_cpp_vendor)
+ament_target_dependencies(inference_node OpenCV Boost yaml_cpp_vendor mcap_vendor Torch)
 
 target_link_libraries(camera mvsdk)
+target_link_libraries(inference_node 
+                      ${TENSORRT_LIB}
+                      ${TENSORRT_ONNX_LIB}
+                      mvsdk
+                      ${TORCH_LIBRARIES})
 
-install(TARGETS camera calibration triangulation
+install(TARGETS camera calibration triangulation inference_node
         DESTINATION lib/${PROJECT_NAME}
 )
 
diff --git a/packages/camera/include/camera.h b/packages/camera/include/camera.h
index 65f161b..6591697 100644
--- a/packages/camera/include/camera.h
+++ b/packages/camera/include/camera.h
@@ -11,7 +11,7 @@
 #include <cstdint>
 
 #include <boost/lockfree/queue.hpp>
-#include "mcap_vendor/mcap/writer.hpp"
+#include "mcap/writer.hpp"
 
 using namespace std::chrono_literals;
 
@@ -147,5 +147,7 @@ class CameraRecorder {
 
     mcap::McapWriter mcap_writer_;
     MappedFileManager file_manager_;
+
+    std::function<void(SynchronizedFrameBuffers*)> deleter_;
 };
 }  // namespace handy::camera
diff --git a/packages/camera/include/inference_node.h b/packages/camera/include/inference_node.h
new file mode 100644
index 0000000..3453076
--- /dev/null
+++ b/packages/camera/include/inference_node.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include "camera.h"
+#include <torch/script.h>
+#include <opencv2/core.hpp>
+#include "c10/cuda/CUDAStream.h"
+#include <boost/lockfree/queue.hpp>
+#include <mutex>
+#include <atomic>
+#include <thread>
+#include <condition_variable>
+#include <deque>
+#include <vector>
+#include <functional>
+#include <memory>
+#include <array>
+
+namespace handy::camera {
+
+class BallInferenceNode {
+  public:
+    static const int kMaxCameraNum = 2;
+
+    /// Per‐frame detection result
+    struct DetectionOutput {
+        std::array<cv::Point, kMaxCameraNum> centers;  // one ball center per camera image
+        std::array<bool, kMaxCameraNum> flags;         // true = needs segmentation for that camera
+        uint32_t timestamp = 0;
+        SynchronizedFrameBuffers* images_to_segment = nullptr;
+    };
+
+    /// Per‐frame segmentation result
+    struct SegmentationOutput {
+        // one centroid per camera image that requested segmentation
+        std::vector<cv::Point> centroids;
+        std::vector<int> camera_indices;
+        uint32_t timestamp = 0;
+    };
+
+    using DetectionCallback = std::function<void(DetectionOutput)>;
+    using SegmentationCallback = std::function<void(SegmentationOutput)>;
+
+    /// ctor loads both models and sets up queues/threads
+    BallInferenceNode(
+        const std::string& det_model_path, const std::string& seg_model_path, Size input_size,
+        std::function<void(SynchronizedFrameBuffers*)> deleter, int window_size = 5);
+    ~BallInferenceNode();
+
+    /// Subscribe to detection outputs
+    /// Called from arbitrary thread; callbacks must be lock‐free/low‐latency.
+    void registerDetectionSubscriber(const DetectionCallback& cb);
+
+    /// Subscribe to segmentation outputs
+    void registerSegmentationSubscriber(const SegmentationCallback& cb);
+
+    /// Start the internal threads
+    void start();
+
+    /// Signal all threads to stop, join them, and release resources
+    void stop();
+
+    void handleSyncBuffer(std::shared_ptr<SynchronizedFrameBuffers> buffer);
+
+  private:
+    void detectSingleCamera(const std::deque<at::Tensor>& frames, DetectionOutput& detect_result,
+        int camera_idx, c10::cuda::CUDAStream& infer_stream);
+    void detectionLoop();
+    void segmentationLoop();
+
+
+    // === model objects ===
+    torch::jit::script::Module det_model_;
+    torch::jit::script::Module seg_model_;
+
+    std::function<void(SynchronizedFrameBuffers*)> buffer_deleter_;
+
+    struct Param {
+        // === input queue & rolling window ===
+        int window_size = 5;
+        Size input_size{320, 192};
+        const at::Tensor means =
+            torch::tensor({0.077, 0.092, 0.142}).view({1, 3, 1, 1}).to(at::kFloat).to(at::kCUDA);
+        const at::Tensor stds =
+            torch::tensor({0.068, 0.079, 0.108}).view({1, 3, 1, 1}).to(at::kFloat).to(at::kCUDA);
+    } param_;
+
+    struct State {
+        boost::lockfree::queue<SynchronizedFrameBuffers*> det_queue{64};
+
+        // protected by ring_mutex_
+        std::array<std::deque<at::Tensor>, kMaxCameraNum> ring_buffer;
+        std::vector<at::Tensor> pinned_pool;
+        std::atomic<size_t> pool_idx = 0;
+        std::mutex ring_mutex;
+
+        // === intermediate queues ===
+        boost::lockfree::queue<DetectionOutput> seg_queue{128};
+
+        // === subscriber callbacks ===
+        std::vector<DetectionCallback> det_cbs;
+        std::vector<SegmentationCallback> seg_cbs;
+        std::mutex cb_mutex;
+
+        // === threading & synchroization ===
+        std::atomic<bool> running{false};
+        std::thread det_thread;
+        std::thread seg_thread;
+        std::condition_variable seg_cv;
+        std::mutex seg_mutex;
+    } state_;
+};
+
+}  // namespace handy::camera
diff --git a/packages/camera/include/triangulation.h b/packages/camera/include/triangulation.h
index 18ca8b9..15e0f66 100644
--- a/packages/camera/include/triangulation.h
+++ b/packages/camera/include/triangulation.h
@@ -1,5 +1,4 @@
 #include "params.h"
-#include "params.h"
 
 #include <opencv2/core/core.hpp>
 #include <string>
diff --git a/packages/camera/src/camera.cpp b/packages/camera/src/camera.cpp
index 87973fe..1a4a147 100644
--- a/packages/camera/src/camera.cpp
+++ b/packages/camera/src/camera.cpp
@@ -282,6 +282,18 @@ CameraRecorder::CameraRecorder(
         }
     }
 
+    deleter_ = [this](SynchronizedFrameBuffers* sync_buffers_ptr) {
+        for (size_t i = 0; i < sync_buffers_ptr->images.size(); ++i) {
+            if (!sync_buffers_ptr->images[i]) {
+                continue;
+            }
+            while (!this->state_.free_buffers[i]->push(sync_buffers_ptr->images[i])) {
+            }
+            state_.free_buffer_cnts[i]++;
+        }
+        delete sync_buffers_ptr;
+    };
+
     for (int i = 0; i < state_.camera_num; ++i) {
         abortIfNot("reset timestamp", i, CameraRstTimeStamp(state_.camera_handles[i]));
     }
@@ -395,20 +407,9 @@ void CameraRecorder::synchronizeQueues() {
         };
 
         // make single structure as a shared ptr
-        // deleter is expected to push buffers back to the queue and then free the structure
-        auto custom_deleter = [this](SynchronizedFrameBuffers* sync_buffers_ptr) {
-            for (size_t i = 0; i < sync_buffers_ptr->images.size(); ++i) {
-                if (!sync_buffers_ptr->images[i]) {
-                    continue;
-                }
-                while (!this->state_.free_buffers[i]->push(sync_buffers_ptr->images[i])) {
-                }
-                state_.free_buffer_cnts[i]++;
-            }
-            delete sync_buffers_ptr;
-        };
+        // deleter_ is expected to push buffers back to the queue and then free the structure
         std::shared_ptr<SynchronizedFrameBuffers> sync_buffers(
-            new SynchronizedFrameBuffers, custom_deleter);
+            new SynchronizedFrameBuffers, deleter_);
         sync_buffers->images.resize(state_.camera_num, nullptr);
         sync_buffers->image_sizes.resize(state_.camera_num);
 
@@ -440,7 +441,7 @@ void CameraRecorder::synchronizeQueues() {
             // creating new structure
             sync_buffers = std::shared_ptr<SynchronizedFrameBuffers>(
                 new SynchronizedFrameBuffers,
-                custom_deleter);  // custom deleter adds buffer pointer back to the queue
+                deleter_);  // custom deleter adds buffer pointer back to the queue
             sync_buffers->images.resize(state_.camera_num, nullptr);
             sync_buffers->image_sizes.resize(state_.camera_num);
         } while (next());
diff --git a/packages/camera/src/inference_node.cpp b/packages/camera/src/inference_node.cpp
new file mode 100644
index 0000000..1262644
--- /dev/null
+++ b/packages/camera/src/inference_node.cpp
@@ -0,0 +1,283 @@
+#include "inference_node.h"
+#include <opencv2/imgproc.hpp>
+#include <torch/torch.h>
+#include <torch/types.h>
+#include "c10/cuda/CUDAStream.h"
+#include "c10/cuda/CUDAGuard.h"
+#include <ATen/cuda/CUDAContext.h>
+
+namespace handy::camera {
+
+BallInferenceNode::BallInferenceNode(
+    const std::string& det_model_path, const std::string& seg_model_path, Size input_size,
+    std::function<void(SynchronizedFrameBuffers*)> deleter, int window_size) {
+    det_model_ = torch::jit::load(det_model_path);
+    det_model_.eval();
+    det_model_.to(at::kCUDA);
+
+    seg_model_ = torch::jit::load(seg_model_path);
+    seg_model_.eval();
+    seg_model_.to(at::kCUDA);
+
+    // Store params
+    param_.input_size = input_size;
+    param_.window_size = window_size;
+    buffer_deleter_ = deleter;
+
+    for (int i = 0; i < param_.window_size * kMaxCameraNum; ++i) {
+        // Allocate an empty pinned tensor of shape [1, C, H, W]
+        at::Tensor t = torch::empty(
+                           {1, 3, param_.input_size.height, param_.input_size.width},
+                           at::TensorOptions().dtype(torch::kUInt8))
+                           .pin_memory();
+        state_.pinned_pool.push_back(std::move(t));
+    }
+}
+
+BallInferenceNode::~BallInferenceNode() { stop(); }
+
+void BallInferenceNode::registerDetectionSubscriber(const DetectionCallback& cb) {
+    std::lock_guard<std::mutex> lk(state_.cb_mutex);
+    state_.det_cbs.push_back(cb);
+}
+
+void BallInferenceNode::registerSegmentationSubscriber(const SegmentationCallback& cb) {
+    std::lock_guard<std::mutex> lk(state_.cb_mutex);
+    state_.seg_cbs.push_back(cb);
+}
+
+void BallInferenceNode::handleSyncBuffer(std::shared_ptr<SynchronizedFrameBuffers> buffer) {
+    while (!state_.det_queue.push(buffer.get())) {
+    }
+}
+
+void BallInferenceNode::start() {
+    state_.running = true;
+    // Launch threads
+    state_.det_thread = std::thread(&BallInferenceNode::detectionLoop, this);
+    state_.seg_thread = std::thread(&BallInferenceNode::segmentationLoop, this);
+}
+
+void BallInferenceNode::stop() {
+    bool expected = true;
+    if (state_.running.compare_exchange_strong(expected, false)) {
+        state_.seg_cv.notify_one();
+        if (state_.det_thread.joinable()) {
+            state_.det_thread.join();
+        }
+        if (state_.seg_thread.joinable()) {
+            state_.seg_thread.join();
+        }
+    }
+}
+
+void BallInferenceNode::detectSingleCamera(
+    const std::deque<at::Tensor>& frames, DetectionOutput& detect_result, int camera_idx,
+    c10::cuda::CUDAStream& infer_stream) {
+    std::vector<at::Tensor> frames_vec(frames.begin(), frames.end());
+    at::Tensor batch = at::cat(frames_vec, 1);  // dim=1
+
+    torch::NoGradGuard no_grad;
+    at::cuda::CUDAStreamGuard guard(infer_stream);
+    auto out_tuple = det_model_.forward({batch}).toTuple();
+
+    // Unpack model outputs
+    //    out_tuple[0]: prob distribution (1, width + height)
+    //    out_tuple[1]: presence logits (1, 2)
+    at::Tensor center_distr = out_tuple->elements()[0].toTensor();
+    at::Tensor success_detection_distr = out_tuple->elements()[1].toTensor();
+
+    // Threshold & argmax to get (x,y) center
+    //    replicate: pred[pred < thresh]=0, then argmax on slices
+    const float thresh = 0.0001;
+    int W = param_.input_size.width;
+    center_distr = center_distr.clone();
+    center_distr.masked_fill_(center_distr < thresh, 0.0f);
+
+    at::Tensor prob_x = center_distr.slice(1, 0, W);                     // {1, W}
+    at::Tensor prob_y = center_distr.slice(1, W, center_distr.size(1));  // {1, H}
+
+    int64_t pred_x = prob_x.argmax(1).item<int64_t>();
+    int64_t pred_y = prob_y.argmax(1).item<int64_t>();
+    detect_result.centers[camera_idx] = {pred_x, pred_y};
+
+    int64_t pres_idx = success_detection_distr.argmax(1).item<int64_t>();
+    detect_result.flags[camera_idx] = (pres_idx == 1);  // class 1 means object present
+}
+
+void BallInferenceNode::detectionLoop() {
+    auto infer_stream = at::cuda::getStreamFromPool();
+    std::vector<std::deque<at::Tensor>> rollBuf(kMaxCameraNum);
+
+    SynchronizedFrameBuffers* sync;
+    while (state_.running) {
+        if (!state_.det_queue.pop(sync)) {
+            std::this_thread::sleep_for(1ms);
+            continue;
+        }
+        std::cout << "recieved buffer for detection\n";
+
+        DetectionOutput detection_out;
+        bool any_successful_detection = false;
+        auto start = std::chrono::steady_clock::now();
+
+        // Preprocess each camera image
+        for (size_t cam = 0; cam < sync->images.size(); ++cam) {
+            uint8_t* raw = sync->images[cam];
+            Size sz = sync->image_sizes[cam];
+            cv::Mat bayer(sz.height, sz.width, CV_8UC1, raw);
+
+            
+
+            static thread_local cv::Mat rgb;
+            cv::cvtColor(bayer, rgb, cv::COLOR_BayerBG2RGB);
+
+            static thread_local cv::Mat resized;
+            cv::resize(rgb, resized, cv::Size(param_.input_size.width, param_.input_size.height));
+            // To tensor  of shape (1, height, width, channel) and perform all the preprocessing
+            at::Tensor t_uint8 = state_.pinned_pool[state_.pool_idx++ % state_.pinned_pool.size()];
+            std::memcpy(
+                t_uint8.data_ptr<uint8_t>(),  // pinned host memory
+                resized.data,                 // OpenCV RGB data
+                3 * param_.input_size.area());
+            at::Tensor t = t_uint8.to(at::kCUDA, true)
+                               .toType(at::kFloat)
+                               .div_(255.0)
+                               .sub_(param_.means)
+                               .div_(param_.stds);
+
+            rollBuf[cam].push_back(t);
+            if (rollBuf[cam].size() > (size_t)param_.window_size) {
+                rollBuf[cam].pop_front();
+            }
+            if (rollBuf[cam].size() < (size_t)param_.window_size) {
+                continue;
+            }
+            detectSingleCamera(rollBuf[cam], detection_out, cam, infer_stream);
+            any_successful_detection |= detection_out.flags[cam];
+        }
+        auto end = std::chrono::steady_clock::now();
+
+        std::cout << "elapsed: "
+                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+                  << '\n';
+        detection_out.images_to_segment = nullptr;
+        detection_out.timestamp = sync->timestamp;
+
+        {
+            std::lock_guard<std::mutex> lk(state_.cb_mutex);
+            for (auto& cb : state_.det_cbs) {
+                cb(detection_out);
+            }
+        }
+
+        if (any_successful_detection) {
+            // segmentation requires initial images, so we need to send pointer to buffer too
+            detection_out.images_to_segment = sync;
+            state_.seg_queue.push(detection_out);
+            state_.seg_cv.notify_one();
+        } else {
+            // in case no segmentation is required we can release buffer
+            buffer_deleter_(sync);
+        }
+    }
+}
+
+void BallInferenceNode::segmentationLoop() {
+    while (state_.running) {
+        std::unique_lock<std::mutex> lk(state_.seg_mutex);
+        state_.seg_cv.wait_for(
+            lk, 5ms, [this]() { return !state_.running || !state_.seg_queue.empty(); });
+        if (!state_.running && state_.seg_queue.empty()) break;
+
+        DetectionOutput det;
+        if (!state_.seg_queue.pop(det)) {
+            continue;
+        }
+
+        auto sync = det.images_to_segment;
+        if (!sync) {
+            continue;
+        }
+
+        std::vector<at::Tensor> masks;
+        std::vector<int> cams;
+        std::vector<cv::Point> centers;
+        std::vector<cv::Point2f> centroids;
+
+        SegmentationOutput segmentation_output;
+        segmentation_output.timestamp = det.timestamp;
+
+        // Process each flagged camera
+        for (size_t cam = 0; cam < det.flags.size(); ++cam) {
+            if (!det.flags[cam]) continue;
+
+            uint8_t* raw = sync->images[cam];
+            Size sz = sync->image_sizes[cam];
+            cv::Mat bayer(sz.height, sz.width, CV_8UC1, raw);
+            cv::Mat rgb;
+            cv::cvtColor(bayer, rgb, cv::COLOR_BayerBG2RGB);
+
+            const int cropSize = 128;
+            int half = cropSize / 2;
+            cv::Point pt = det.centers[cam];
+            int x = std::clamp(pt.x, half, sz.width - half);
+            int y = std::clamp(pt.y, half, sz.height - half);
+            int x0 = x - half;
+            int y0 = y - half;
+            cv::Rect cropRect(x0, y0, cropSize, cropSize);
+            cv::Mat crop = rgb(cropRect);
+
+            at::Tensor t = torch::from_blob(crop.data, {1, crop.rows, crop.cols, 3}, at::kByte)
+                               .permute({0, 3, 1, 2})
+                               .toType(at::kFloat)
+                               .div_(255.0)
+                               .sub_(param_.means)
+                               .div_(param_.stds)
+                               .pin_memory();
+
+            at::Tensor gpu_in = t.to(at::kCUDA, /*non_blocking=*/true);
+            torch::NoGradGuard ndg;
+            at::Tensor out = seg_model_.forward({gpu_in}).toTensor().cpu();
+            // out shape {1, 1, cropSize, cropSize}
+            at::Tensor prob = out.sigmoid().squeeze(0).squeeze(0);  // {H, W} on CUDA
+
+            constexpr float thresh = 0.5f;
+            at::Tensor bin = prob > thresh;  // ByteTensor {H, W} on CUDA
+            at::Tensor coords = bin.nonzero();
+
+            // compute centroid as the mean of these coords
+            at::Tensor centroid;
+            if (coords.numel() == 0) {
+                // {row_center, col_center}
+                centroid =
+                    torch::tensor({cropSize / 2.0, cropSize / 2.0}, at::kFloat).to(at::kCUDA);
+            } else {
+                // convert coords to float and take mean over dim=0 -> {mean_row, mean_col}
+                centroid = coords.to(at::kFloat).mean(0);
+            }
+
+            centroid = centroid.to(at::kCPU);
+
+            float centroid_y = centroid[0].item<float>() + y0;
+            float centroid_x = centroid[1].item<float>() + x0;
+
+            segmentation_output.centroids.emplace_back(centroid_x, centroid_y);
+            segmentation_output.camera_indices.push_back(cam);
+        }
+
+        if (segmentation_output.camera_indices.empty()) {
+            continue;
+        }
+
+        std::lock_guard<std::mutex> lk_callback(state_.seg_mutex);
+        for (auto& cb : state_.seg_cbs) {
+            cb(segmentation_output);
+        }
+
+        // now we do not need buffer
+        buffer_deleter_(sync);
+    }
+}
+
+}  // namespace handy::camera