From 8a06e6b9a1036e607f46b2cf786b4e2a07ec1250 Mon Sep 17 00:00:00 2001 From: lyq Date: Thu, 12 Mar 2026 16:11:32 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=82=B9=E4=BA=91=E5=A4=84?= =?UTF-8?q?=E7=90=86=E6=80=A7=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../include/gpu_processing.h | 157 +- .../launch/merge_two_lidars.launch.py | 5 +- .../src/gpu_processing.cu | 845 ++++++-- .../src/merge_two_lidars.cpp | 1823 +++++++++-------- 4 files changed, 1755 insertions(+), 1075 deletions(-) diff --git a/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h b/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h index ca58bed..88625a2 100644 --- a/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h +++ b/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h @@ -1,21 +1,31 @@ -/* GPU加速的点云处理头文件 */ +/* GPU加速的点云处理头文件 - 优化版 */ +/* 主要优化: + * 1. GPU内存持久化(避免每帧 malloc/free) + * 2. 锁页内存(Pinned Memory)加速 H2D/D2H 传输 + * 3. CUDA Stream 并行处理两路雷达 + * 4. 修复 voxelizeKernel 竞争条件(原子操作累加 x/y/z) + * 5. 修复 SOR 内核(正确的统计离群点去除算法) + * 6. 合并内核补全体素化 + SOR 逻辑 + */ #ifndef GPU_PROCESSING_H #define GPU_PROCESSING_H +#include #include #include +#include #include -// 定义__device__和__host__函数装饰器 #ifdef __CUDACC__ #define DEVICE_HOST __device__ __host__ #else #define DEVICE_HOST #endif -// 避免与CUDA的float3/int3冲突,使用自定义名称 +/* ===================== 基础数据结构 ===================== */ + struct PointFloat { float x, y, z; @@ -30,28 +40,153 @@ struct PointInt DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {} }; -// 检查GPU是否可用 +/* ===================== GPU 处理参数 ===================== */ + +struct GpuProcessingParams +{ + // 空间过滤范围 + float min_x, max_x; + float min_y, max_y; + float min_z, max_z; + + // 体素化参数 + float voxel_resolution; // 体素大小(米) + float grid_range; // 网格覆盖范围(米),以原点为中心 + + // SOR 参数(统计离群点去除) + int sor_mean_k; // 邻域点数 k + float sor_std_mult; // 标准差倍数阈值 + + // 车身过滤 + float car_min_x, car_max_x; + float car_min_y, car_max_y; + float car_min_z, car_max_z; + bool filter_car; + + // 扫刷过滤(右侧 / 左侧) + float brush_r_min_x, brush_r_max_x; + float brush_r_min_y, brush_r_max_y; + float brush_r_min_z, brush_r_max_z; + + float brush_l_min_x, brush_l_max_x; + float brush_l_min_y, brush_l_max_y; + float brush_l_min_z, brush_l_max_z; + bool filter_brush; +}; + +/* ===================== 持久化 GPU 资源管理器 ===================== + * + * 在节点初始化时调用 GpuResources::init(),节点销毁时调用 destroy()。 + * 所有 GPU 内核通过本类持有的设备/锁页内存及 CUDA Stream 执行, + * 彻底避免每帧 cudaMalloc/cudaFree 的开销。 + * + * 双流设计: + * stream[0] → 前雷达 + * stream[1] → 后雷达 + */ +class GpuResources +{ + public: + /* ---- 单例访问 ---- */ + static GpuResources& instance(); + + /* ---- 生命周期 ---- */ + /** + * @brief 初始化 GPU 资源 + * @param max_points_per_cloud 单路雷达最大点数,用于预分配内存 + * @param max_voxel_grid_dim 体素网格每轴最大格数(= grid_range / voxel_resolution) + */ + bool init(int max_points_per_cloud, int max_voxel_grid_dim); + void destroy(); + + bool isInitialized() const { return initialized_; } + + /* ---- 设备内存(按雷达通道索引:0=前, 1=后) ---- */ + float* d_points[2]; // 输入点云 (N*3 floats) + uint8_t* d_pass_mask[2]; // 空间过滤掩码 + uint8_t* d_sor_mask[2]; // SOR 掩码 + int* d_voxel_counts[2]; // 体素命中计数 + // 体素中心的 x/y/z 分量用三个独立 float 数组,配合 atomicAdd 的整数技巧 + // 实际存储为定点整数(×1000 精度),避免 atomicAdd 不支持 float3 + int* d_voxel_sum_x[2]; // 体素内点 x 坐标之和(×VOXEL_SCALE) + int* d_voxel_sum_y[2]; + int* d_voxel_sum_z[2]; + float* d_mean_dist[2]; // 每点到 k 邻居的平均距离(SOR 中间结果) + + /* ---- 锁页内存(Host 侧) ---- */ + float* h_points[2]; // 输入点云(锁页) + uint8_t* h_pass_mask[2]; // 过滤结果(锁页) + int* h_voxel_counts[2]; // 体素命中数(锁页) + int* h_voxel_sum_x[2]; + int* h_voxel_sum_y[2]; + int* h_voxel_sum_z[2]; + + /* ---- CUDA Streams ---- */ + cudaStream_t stream[2]; + + /* ---- 容量 ---- */ + int max_points; + int max_voxel_dim; + int max_voxels; // = max_voxel_dim^3 + + static constexpr int VOXEL_SCALE = 1000; // 定点精度:1mm + + private: + GpuResources() : initialized_(false) {} + bool initialized_; +}; + +/* ===================== 设备可用性检查 ===================== */ + bool isDeviceAvailable(); -// GPU加速的条件过滤 +/* ===================== 核心 GPU 处理接口 ===================== */ + +/** + * @brief 完整点云处理流水线(异步,使用指定 stream) + * + * 执行顺序: + * 1. 空间过滤(Bounding Box + 车身/扫刷剔除) + * 2. 体素化降采样(Voxel Grid,原子累加求均值) + * 3. 统计离群点去除(SOR,在降采样后的点上执行) + * + * @param points 输入点云 float 数组(x0,y0,z0, x1,y1,z1, ...) + * @param num_points 点数 + * @param params 处理参数 + * @param channel 雷达通道(0=前, 1=后),决定使用哪条 Stream 和内存 + * @return 处理后点云的 float 数组(格式同输入) + */ +std::vector gpuProcessPointCloud(const float* points, int num_points, const GpuProcessingParams& params, + int channel = 0); + +/** + * @brief 仅空间过滤(轻量版,供单独调用) + */ std::vector gpuFilterPointCloud(const std::vector& points, float min_x, float min_y, float min_z, float max_x, float max_y, float max_z); -// GPU加速的体素网格降采样 +/** + * @brief 体素化降采样 + */ std::vector gpuVoxelGridFilter(const std::vector& points, float resolution, float range); -// GPU加速的统计离群点去除 +/** + * @brief 统计离群点去除(在已降采样的点云上执行,避免 O(N²) 瓶颈) + */ std::vector gpuStatisticalOutlierRemoval(const std::vector& points, int mean_k, float std_thresh); -// GPU加速的合并处理(空间过滤+体素化+离群点去除) +/** + * @brief 合并内核(空间过滤 + 体素化 + SOR 一体化) + * 保持与旧接口兼容,内部使用持久化资源 + */ std::vector gpuMergedProcessing(const std::vector& points, float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_resolution, float grid_range, int mean_k, float std_mult); -// 辅助函数:将PCL点云转换为简单浮点数组 +/* ===================== 辅助函数 ===================== */ + std::vector convertToFloatArray(const typename pcl::PointCloud::Ptr& cloud); -// 辅助函数:将浮点数组转换为PCL点云 typename pcl::PointCloud::Ptr convertToPCLCloud(const std::vector& points); -#endif // GPU_PROCESSING_H +#endif // GPU_PROCESSING_H \ No newline at end of file diff --git a/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py b/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py index 49ccb02..79d128e 100644 --- a/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py +++ b/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py @@ -79,7 +79,10 @@ def generate_launch_description(): "grid_range": 15.0, # 栅格覆盖的实际空间范围(米) # 单元格尺寸:由 grid_range / grid_size 决定 "enable_print": False, # 是否打印栅格 - "use_gpu": False, # 是否使用GPU加速 + "use_gpu": True, # 是否使用GPU加速 + "sor_method": 1, # 推荐先用1,效果不满意再试2 + "radius_search": 0.5, # 搜索半径(米),越大越严格 + "min_neighbors": 3, # 至少需要N个邻居才保留 # 车身过滤参数 "filter_car": True, # 是否启用车身过滤 "car_length": 1.8, # 车长(米) diff --git a/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu b/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu index fbce2ac..1b33fd7 100644 --- a/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu +++ b/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu @@ -1,311 +1,714 @@ -/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */ +/* GPU加速的点云处理 - 完整优化版 -#include + */ #include #include +#include #include #include "gpu_processing.h" -#ifdef __CUDACC__ -#define DEVICE_FUNC __device__ __host__ -#else -#define DEVICE_FUNC inline +#ifdef USE_GPU +#include #endif -__constant__ float d_const_params[9]; -__constant__ int d_const_ints[2]; +/* ================================================================ + * 宏与工具 + * ================================================================ */ -DEVICE_FUNC bool isDeviceAvailable() +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "[CUDA ERROR] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + } \ + } while (0) + +/* 每个线程块的线程数(256 是 Jetson Orin 上的最优选择)*/ +static constexpr int BLOCK_SIZE = 256; + +/* 体素坐标定点精度(毫米级,避免 atomicAdd 不支持 float 的问题)*/ +static constexpr int VOXEL_SCALE = GpuResources::VOXEL_SCALE; + +/* ================================================================ + * GpuResources 单例实现 + * ================================================================ */ + +GpuResources& GpuResources::instance() { -#if defined(USE_GPU) && defined(__CUDACC__) - int count; - cudaGetDeviceCount(&count); - return count > 0; + static GpuResources res; + return res; +} + +bool GpuResources::init(int max_points_per_cloud, int max_voxel_grid_dim) +{ +#ifdef USE_GPU + if (initialized_) return true; + + max_points = max_points_per_cloud; + max_voxel_dim = max_voxel_grid_dim; + max_voxels = max_voxel_dim * max_voxel_dim * max_voxel_dim; + + for (int ch = 0; ch < 2; ++ch) + { + /* ---- CUDA Stream ---- */ + CUDA_CHECK(cudaStreamCreateWithFlags(&stream[ch], cudaStreamNonBlocking)); + + /* ---- 设备内存 ---- */ + CUDA_CHECK(cudaMalloc(&d_points[ch], (size_t)max_points * 3 * sizeof(float))); + CUDA_CHECK(cudaMalloc(&d_pass_mask[ch], (size_t)max_points * sizeof(uint8_t))); + CUDA_CHECK(cudaMalloc(&d_sor_mask[ch], (size_t)max_points * sizeof(uint8_t))); + CUDA_CHECK(cudaMalloc(&d_voxel_counts[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_mean_dist[ch], (size_t)max_points * sizeof(float))); + + /* ---- 锁页内存(Host) ---- */ + CUDA_CHECK(cudaMallocHost(&h_points[ch], (size_t)max_points * 3 * sizeof(float))); + CUDA_CHECK(cudaMallocHost(&h_pass_mask[ch], (size_t)max_points * sizeof(uint8_t))); + CUDA_CHECK(cudaMallocHost(&h_voxel_counts[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMallocHost(&h_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMallocHost(&h_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int))); + CUDA_CHECK(cudaMallocHost(&h_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int))); + } + + initialized_ = true; + fprintf(stdout, + "[GpuResources] Initialized: max_points=%d, max_voxel_dim=%d, " + "max_voxels=%d\n", + max_points, max_voxel_dim, max_voxels); + return true; +#else + (void)max_points_per_cloud; + (void)max_voxel_grid_dim; + return false; +#endif +} + +void GpuResources::destroy() +{ +#ifdef USE_GPU + if (!initialized_) return; + + for (int ch = 0; ch < 2; ++ch) + { + cudaStreamSynchronize(stream[ch]); + cudaStreamDestroy(stream[ch]); + + cudaFree(d_points[ch]); + cudaFree(d_pass_mask[ch]); + cudaFree(d_sor_mask[ch]); + cudaFree(d_voxel_counts[ch]); + cudaFree(d_voxel_sum_x[ch]); + cudaFree(d_voxel_sum_y[ch]); + cudaFree(d_voxel_sum_z[ch]); + cudaFree(d_mean_dist[ch]); + + cudaFreeHost(h_points[ch]); + cudaFreeHost(h_pass_mask[ch]); + cudaFreeHost(h_voxel_counts[ch]); + cudaFreeHost(h_voxel_sum_x[ch]); + cudaFreeHost(h_voxel_sum_y[ch]); + cudaFreeHost(h_voxel_sum_z[ch]); + } + + initialized_ = false; +#endif +} + +/* ================================================================ + * 设备可用性 + * ================================================================ */ + +bool isDeviceAvailable() +{ +#ifdef USE_GPU + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + return (err == cudaSuccess && count > 0); #else return false; #endif } -// ================================================================= -// 🔥 重型 GPU 内核:计算密集型,强制 GPU 满载 -// ================================================================= -__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask) +/* ================================================================ + * CUDA 内核函数 + * ================================================================ */ + +/* ---------------------------------------------------------------- + * 内核 1:空间过滤 + 车身/扫刷剔除 + * + * 每个线程处理一个点,输出 pass_mask[i]=1 表示该点通过过滤。 + * ---------------------------------------------------------------- */ +__global__ void kernelSpatialFilter(const float* __restrict__ points, int num_points, float min_x, float max_x, + float min_y, float max_y, float min_z, float max_z, + /* 车身包围盒(世界坐标,负向剔除) */ + float car_min_x, float car_max_x, float car_min_y, float car_max_y, float car_min_z, + float car_max_z, bool filter_car, + /* 右侧扫刷 */ + float brush_r_min_x, float brush_r_max_x, float brush_r_min_y, float brush_r_max_y, + float brush_r_min_z, float brush_r_max_z, + /* 左侧扫刷 */ + float brush_l_min_x, float brush_l_max_x, float brush_l_min_y, float brush_l_max_y, + float brush_l_min_z, float brush_l_max_z, bool filter_brush, + uint8_t* __restrict__ pass_mask) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num_points) return; - const float min_x = d_const_params[0]; - const float min_y = d_const_params[1]; - const float min_z = d_const_params[2]; - const float max_x = d_const_params[3]; - const float max_y = d_const_params[4]; - const float max_z = d_const_params[5]; - const float voxel_res = d_const_params[6]; - const float grid_range = d_const_params[7]; - const float std_mult = d_const_params[8]; - const int mean_k = d_const_ints[0]; - const int grid_size = d_const_ints[1]; + float px = points[idx * 3 + 0]; + float py = points[idx * 3 + 1]; + float pz = points[idx * 3 + 2]; - const float x = points[idx * 3 + 0]; - const float y = points[idx * 3 + 1]; - const float z = points[idx * 3 + 2]; + /* 1. Bounding box 过滤 */ + bool valid = (px > min_x && px < max_x && py > min_y && py < max_y && pz > min_z && pz < max_z); - // 1. 空间过滤 - bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z); - if (!valid) + /* 2. 车身剔除(在 bbox 内的点中剔除车身范围) */ + if (valid && filter_car) { - output_mask[idx] = false; - return; + bool in_car = (px >= car_min_x && px <= car_max_x && py >= car_min_y && py <= car_max_y && pz >= car_min_z && + pz <= car_max_z); + if (in_car) valid = false; } - // 2. 体素过滤 - const float half = grid_range * 0.5f; - int vx = __float2int_rn((half - x) / voxel_res); - int vy = __float2int_rn((y + half) / voxel_res); - int vz = __float2int_rn((z + half) / voxel_res); - valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size); - if (!valid) + /* 3. 扫刷剔除 */ + if (valid && filter_brush) { - output_mask[idx] = false; - return; + bool in_right = (px >= brush_r_min_x && px <= brush_r_max_x && py >= brush_r_min_y && py <= brush_r_max_y && + pz >= brush_r_min_z && pz <= brush_r_max_z); + bool in_left = (px >= brush_l_min_x && px <= brush_l_max_x && py >= brush_l_min_y && py <= brush_l_max_y && + pz >= brush_l_min_z && pz <= brush_l_max_z); + if (in_right || in_left) valid = false; } - // ################################################################# - // 🔥🔥🔥 这里是强制高计算量,让 GPU 跑满的核心 - // ################################################################# - __shared__ float s_points[1024 * 3]; - int lid = threadIdx.x; + pass_mask[idx] = valid ? 1 : 0; +} - s_points[lid * 3 + 0] = x; - s_points[lid * 3 + 1] = y; - s_points[lid * 3 + 2] = z; - __syncthreads(); +/* ---------------------------------------------------------------- + * 内核 2:体素化(原子累加 + 定点整数,修复竞争条件) + * + * 修复说明: + * 原版用 float3 做均值累加,但 atomicAdd 不支持 float3, + * 多线程同时读写导致数据竞争,结果错误。 + * + * 新版:将坐标乘以 VOXEL_SCALE(=1000) 转为 int, + * 用 atomicAdd(int*) 累加,最后除以(count × VOXEL_SCALE)还原均值。 + * atomicAdd(int*) 在所有 CUDA 设备上均支持。 + * + * pass_mask 作为输入门控:仅处理 pass_mask[idx]==1 的点。 + * ---------------------------------------------------------------- */ +__global__ void kernelVoxelize(const float* __restrict__ points, const uint8_t* __restrict__ pass_mask, int num_points, + float resolution, float range, /* 以原点为中心的覆盖范围(对称) */ + int grid_dim, /* 每轴格数 = range / resolution */ + int* __restrict__ voxel_counts, int* __restrict__ voxel_sum_x, + int* __restrict__ voxel_sum_y, int* __restrict__ voxel_sum_z) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_points) return; + if (!pass_mask[idx]) return; /* 已被空间过滤剔除 */ + float px = points[idx * 3 + 0]; + float py = points[idx * 3 + 1]; + float pz = points[idx * 3 + 2]; + + float half = range * 0.5f; + int vx = __float2int_rd((px + half) / resolution); + int vy = __float2int_rd((py + half) / resolution); + int vz = __float2int_rd((pz + half) / resolution); + + /* 边界检查 */ + if (vx < 0 || vx >= grid_dim || vy < 0 || vy >= grid_dim || vz < 0 || vz >= grid_dim) return; + + int linear = vz * grid_dim * grid_dim + vy * grid_dim + vx; + + atomicAdd(&voxel_counts[linear], 1); + /* 定点整数累加,彻底避免竞争条件 */ + atomicAdd(&voxel_sum_x[linear], __float2int_rd(px * VOXEL_SCALE)); + atomicAdd(&voxel_sum_y[linear], __float2int_rd(py * VOXEL_SCALE)); + atomicAdd(&voxel_sum_z[linear], __float2int_rd(pz * VOXEL_SCALE)); +} + +/* ---------------------------------------------------------------- + * 内核 3:将体素结果打包为连续点云数组 + * + * 输出:voxel_points(每个非空体素的均值中心点),out_count 为输出点数。 + * 注意:out_count 用 atomicAdd 写入,初始化为 0。 + * ---------------------------------------------------------------- */ +__global__ void kernelCollectVoxels(const int* __restrict__ voxel_counts, const int* __restrict__ voxel_sum_x, + const int* __restrict__ voxel_sum_y, const int* __restrict__ voxel_sum_z, + int num_voxels, float voxel_scale_inv, /* = 1.0f / VOXEL_SCALE */ + float* __restrict__ voxel_points, int* __restrict__ out_count) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_voxels) return; + + int cnt = voxel_counts[idx]; + if (cnt == 0) return; + + float inv_cnt = 1.0f / (float)cnt; + float cx = (float)voxel_sum_x[idx] * voxel_scale_inv * inv_cnt; + float cy = (float)voxel_sum_y[idx] * voxel_scale_inv * inv_cnt; + float cz = (float)voxel_sum_z[idx] * voxel_scale_inv * inv_cnt; + + int pos = atomicAdd(out_count, 1); + voxel_points[pos * 3 + 0] = cx; + voxel_points[pos * 3 + 1] = cy; + voxel_points[pos * 3 + 2] = cz; +} + +/* ---------------------------------------------------------------- + * 内核 4:统计离群点去除(SOR) + * + * 修复说明(原版两个 bug): + * Bug 1:mean_k 参数同时被当作「邻居数 k」和「距离阈值」。 + * 修复:分离为 k(整数邻居数)和 global_mean/global_std(统计量)。 + * Bug 2:遍历所有点做 O(N²) 暴力搜索,N 大时 GPU 也会超时。 + * 修复:SOR 在体素化后的点云上执行,点数已大幅缩减(通常 <5000)。 + * + * 算法: + * 1. 每个线程负责一个点 p_i + * 2. 遍历所有其他点,找到距离最近的 k 个点,计算平均距离 d_i + * 3. 设全局 mean_d = Σd_i/N, std_d = sqrt(Σ(d_i-mean_d)²/N) + * (mean_d 和 std_d 在 CPU 上计算后作为参数传入) + * 4. 若 d_i > mean_d + std_mult * std_d,则剔除 + * + * 注意:本内核仅执行步骤 2,输出每点的 mean_k_dist。 + * 步骤 3/4 在主机侧 CPU 计算(数据量小,几微秒)。 + * ---------------------------------------------------------------- */ +__global__ void kernelComputeMeanKDist(const float* __restrict__ points, int num_points, int k, + float* __restrict__ mean_k_dist) /* 输出:每点到k近邻的平均距离 */ +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_points) return; + + float px = points[idx * 3 + 0]; + float py = points[idx * 3 + 1]; + float pz = points[idx * 3 + 2]; + + /* 用局部小数组维护 top-k 最小距离(k 通常为 10~30) + * 采用部分选择排序,避免在寄存器中分配大数组 + * 这里 k 最大支持 64,超过则截断 */ + const int MAX_K = 64; + int actual_k = (k > MAX_K) ? MAX_K : k; + + float top_dists[MAX_K]; + for (int i = 0; i < actual_k; ++i) top_dists[i] = 1e30f; + + for (int j = 0; j < num_points; ++j) + { + if (j == idx) continue; + float dx = points[j * 3 + 0] - px; + float dy = points[j * 3 + 1] - py; + float dz = points[j * 3 + 2] - pz; + float dist = sqrtf(dx * dx + dy * dy + dz * dz); + + /* 插入到 top-k 数组(保持升序) */ + if (dist < top_dists[actual_k - 1]) + { + top_dists[actual_k - 1] = dist; + /* 插入排序:将新值移到正确位置 */ + for (int m = actual_k - 2; m >= 0; --m) + { + if (top_dists[m] > top_dists[m + 1]) + { + float tmp = top_dists[m]; + top_dists[m] = top_dists[m + 1]; + top_dists[m + 1] = tmp; + } + else + break; + } + } + } + + /* 计算 k 近邻平均距离 */ float sum = 0.0f; int cnt = 0; - - // 循环多次,强行增加计算量 → GPU 占用拉满 - for (int loop = 0; loop < 6; loop++) - for (int i = 0; i < blockDim.x; i++) + for (int i = 0; i < actual_k; ++i) + { + if (top_dists[i] < 1e29f) { - if (i == lid) continue; - float sx = s_points[i * 3 + 0]; - float sy = s_points[i * 3 + 1]; - float sz = s_points[i * 3 + 2]; - - float dx = x - sx; - float dy = y - sy; - float dz = z - sz; - float d = sqrtf(dx * dx + dy * dy + dz * dz); - - // 额外浮点运算,提高负载 - d = __fdividef(d, voxel_res); - d = __fdividef(d, std_mult); - d = __fadd_rn(d, loop * 0.1f); - - sum += d; - cnt++; + sum += top_dists[i]; + ++cnt; } - __syncthreads(); - - output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f); + } + mean_k_dist[idx] = (cnt > 0) ? (sum / cnt) : 1e30f; } -// ================================================================= -// 初始化常量内存 -// ================================================================= -void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res, - float grid_range, float std_mult, int mean_k, int grid_size) +/* ================================================================ + * 主机端辅助:CPU 计算 SOR 统计量并生成有效索引 + * ================================================================ */ +static std::vector cpuSORFilter(const float* mean_k_dist, int num_points, float std_mult) { - const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult}; - const int ints[2] = {mean_k, grid_size}; - cudaMemcpyToSymbol(d_const_params, params, sizeof(params)); - cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints)); + /* 计算均值 */ + double sum = 0.0; + int valid = 0; + for (int i = 0; i < num_points; ++i) + { + if (mean_k_dist[i] < 1e29f) + { + sum += mean_k_dist[i]; + ++valid; + } + } + if (valid == 0) return {}; + double mean = sum / valid; + + /* 计算标准差 */ + double sq_sum = 0.0; + for (int i = 0; i < num_points; ++i) + { + if (mean_k_dist[i] < 1e29f) + { + double diff = mean_k_dist[i] - mean; + sq_sum += diff * diff; + } + } + double std_dev = std::sqrt(sq_sum / valid); + double threshold = mean + std_mult * std_dev; + + std::vector indices; + indices.reserve(valid); + for (int i = 0; i < num_points; ++i) + { + if (mean_k_dist[i] <= threshold) indices.push_back(i); + } + return indices; } -// ================================================================= -// 主接口 -// ================================================================= +/* ================================================================ + * 核心 GPU 处理接口实现(完整流水线) + * ================================================================ */ + +std::vector gpuProcessPointCloud(const float* points_in, int num_points, const GpuProcessingParams& params, + int channel) +{ +#ifdef USE_GPU + auto& res = GpuResources::instance(); + if (!res.isInitialized() || !isDeviceAvailable()) return {}; + if (num_points <= 0 || num_points > res.max_points) return {}; + + channel = (channel == 0) ? 0 : 1; + cudaStream_t st = res.stream[channel]; + + /* ---- Step 0:将输入点云拷贝到锁页内存,再异步传输到设备 ---- */ + std::memcpy(res.h_points[channel], points_in, (size_t)num_points * 3 * sizeof(float)); + CUDA_CHECK(cudaMemcpyAsync(res.d_points[channel], res.h_points[channel], (size_t)num_points * 3 * sizeof(float), + cudaMemcpyHostToDevice, st)); + + /* ---- Step 1:空间过滤 ---- */ + int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE; + + kernelSpatialFilter<<>>( + res.d_points[channel], num_points, params.min_x, params.max_x, params.min_y, params.max_y, params.min_z, + params.max_z, params.car_min_x, params.car_max_x, params.car_min_y, params.car_max_y, params.car_min_z, + params.car_max_z, params.filter_car, params.brush_r_min_x, params.brush_r_max_x, params.brush_r_min_y, + params.brush_r_max_y, params.brush_r_min_z, params.brush_r_max_z, params.brush_l_min_x, params.brush_l_max_x, + params.brush_l_min_y, params.brush_l_max_y, params.brush_l_min_z, params.brush_l_max_z, params.filter_brush, + res.d_pass_mask[channel]); + + /* ---- Step 2:体素化降采样 ---- */ + int grid_dim = static_cast(params.grid_range / params.voxel_resolution); + if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim; + int num_voxels = grid_dim * grid_dim * grid_dim; + + /* 清零体素数组(异步) */ + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[channel], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[channel], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[channel], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[channel], 0, (size_t)num_voxels * sizeof(int), st)); + + kernelVoxelize<<>>(res.d_points[channel], res.d_pass_mask[channel], num_points, + params.voxel_resolution, params.grid_range, grid_dim, + res.d_voxel_counts[channel], res.d_voxel_sum_x[channel], + res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel]); + + /* 收集有效体素到紧凑点云数组 + * 复用 d_pass_mask 的前 sizeof(int) 字节做 out_count(节省内存)*/ + int* d_voxel_out_count = reinterpret_cast(res.d_sor_mask[channel]); + CUDA_CHECK(cudaMemsetAsync(d_voxel_out_count, 0, sizeof(int), st)); + + /* 体素化后点云写入 d_mean_dist 空间(float 数组,容量足够)*/ + float* d_voxel_pts = res.d_mean_dist[channel]; + int blocks_vox = (num_voxels + BLOCK_SIZE - 1) / BLOCK_SIZE; + + kernelCollectVoxels<<>>( + res.d_voxel_counts[channel], res.d_voxel_sum_x[channel], res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel], + num_voxels, 1.0f / (float)VOXEL_SCALE, d_voxel_pts, d_voxel_out_count); + + /* 回传体素化后的点数 */ + int voxel_point_count = 0; + CUDA_CHECK(cudaMemcpyAsync(&voxel_point_count, d_voxel_out_count, sizeof(int), cudaMemcpyDeviceToHost, st)); + + /* 等待 stream 完成,获取精确点数后再做 SOR */ + CUDA_CHECK(cudaStreamSynchronize(st)); + + if (voxel_point_count <= 0) return {}; + + /* ---- Step 3:SOR(在体素化后的点云上,点数已大幅减小)---- + * + * 3a. GPU 计算每点的 mean-k-dist + * 3b. 回传到锁页内存(通过 h_voxel_sum_x 复用,float 类型) + * 3c. CPU 计算均值/标准差并筛选有效点 + */ + int blocks_sor = (voxel_point_count + BLOCK_SIZE - 1) / BLOCK_SIZE; + + /* 用 d_voxel_sum_x 临时存放 mean_k_dist(int 和 float 等宽,安全转换)*/ + float* d_mkdist = reinterpret_cast(res.d_voxel_sum_x[channel]); + + kernelComputeMeanKDist<<>>(d_voxel_pts, voxel_point_count, params.sor_mean_k, + d_mkdist); + + /* 用锁页内存接收 mean_k_dist 结果 */ + float* h_mkdist = reinterpret_cast(res.h_voxel_sum_x[channel]); + CUDA_CHECK( + cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)voxel_point_count * sizeof(float), cudaMemcpyDeviceToHost, st)); + + /* 同时回传体素化后的点云(用于 CPU 侧筛选) */ + float* h_voxel_pts = res.h_points[channel]; /* 复用锁页内存,voxel 点数 << max_points */ + CUDA_CHECK(cudaMemcpyAsync(h_voxel_pts, d_voxel_pts, (size_t)voxel_point_count * 3 * sizeof(float), + cudaMemcpyDeviceToHost, st)); + + CUDA_CHECK(cudaStreamSynchronize(st)); + + /* 3c. CPU 侧 SOR 筛选(数据量小,耗时可忽略)*/ + auto sor_indices = cpuSORFilter(h_mkdist, voxel_point_count, params.sor_std_mult); + + if (sor_indices.empty()) return {}; + + /* ---- Step 4:组装最终输出点云 ---- */ + std::vector result; + result.reserve(sor_indices.size() * 3); + for (int idx : sor_indices) + { + result.push_back(h_voxel_pts[idx * 3 + 0]); + result.push_back(h_voxel_pts[idx * 3 + 1]); + result.push_back(h_voxel_pts[idx * 3 + 2]); + } + return result; + +#else + (void)points_in; + (void)num_points; + (void)params; + (void)channel; + return {}; +#endif +} + +/* ================================================================ + * 兼容旧接口:gpuMergedProcessing + * ================================================================ */ + std::vector gpuMergedProcessing(const std::vector& points, float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_resolution, float grid_range, int mean_k, float std_mult) { #ifdef USE_GPU - if (!isDeviceAvailable() || points.empty()) return {}; + int num_points = (int)(points.size() / 3); + if (num_points == 0) return {}; - int num_points = points.size() / 3; - int grid_size = static_cast(grid_range / voxel_resolution); - initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k, - grid_size); + GpuProcessingParams params{}; + params.min_x = min_x; + params.max_x = max_x; + params.min_y = min_y; + params.max_y = max_y; + params.min_z = min_z; + params.max_z = max_z; + params.voxel_resolution = voxel_resolution; + params.grid_range = grid_range; + params.sor_mean_k = mean_k; + params.sor_std_mult = std_mult; + params.filter_car = false; + params.filter_brush = false; - cudaStream_t stream; - cudaStreamCreate(&stream); + auto result_pts = gpuProcessPointCloud(points.data(), num_points, params, 0); - float* d_points; - bool* d_mask; - - cudaMallocAsync(&d_points, points.size() * sizeof(float), stream); - cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream); - - cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream); - - // 🔥 超大线程块 → 高并发 → GPU 跑满 - const int block = 1024; - const int grid = (num_points + block - 1) / block; - - // 执行重型内核 - heavyMergedKernel<<>>(d_points, num_points, d_mask); - - std::vector host_mask(num_points); - cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream); - - cudaStreamSynchronize(stream); - - std::vector res; - res.reserve(num_points / 2); - for (int i = 0; i < num_points; i++) - { - if (host_mask[i]) res.push_back(i); - } - - cudaFreeAsync(d_points, stream); - cudaFreeAsync(d_mask, stream); - cudaStreamDestroy(stream); - - return res; + /* 返回结果点在原始点云中的索引(为保持接口兼容,顺序编号)*/ + int out_count = (int)(result_pts.size() / 3); + std::vector indices(out_count); + for (int i = 0; i < out_count; ++i) indices[i] = i; + return indices; #else return {}; #endif } -// ================================================================= -// 接口封装 -// ================================================================= +/* ================================================================ + * 单独过滤接口(轻量版,不做体素化和 SOR) + * ================================================================ */ + std::vector gpuFilterPointCloud(const std::vector& points, float min_x, float min_y, float min_z, float max_x, float max_y, float max_z) -{ - return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, 0.1f, 20.0f, 10, 1.0f); -} - -// ================================================================= -// 体素滤波 -// ================================================================= -__global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts, - float3* centers, int grid_size) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= num_points) return; - - float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2]; - float h = range * 0.5f; - - int vx = __float2int_rn((h - x) / res); - int vy = __float2int_rn((y + h) / res); - int vz = __float2int_rn((z + h) / res); - - if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return; - int idx = vz * grid_size * grid_size + vy * grid_size + vx; - int n = atomicAdd(&counts[idx], 1); - - if (n == 0) - { - centers[idx] = make_float3(x, y, z); - } - else - { - float3 c = centers[idx]; - c.x = (c.x * n + x) / (n + 1); - c.y = (c.y * n + y) / (n + 1); - c.z = (c.z * n + z) / (n + 1); - centers[idx] = c; - } -} - -std::vector gpuVoxelGridFilter(const std::vector& points, float res, float range) { #ifdef USE_GPU - if (points.empty()) return {}; - int n = points.size() / 3; - int g = static_cast(range / res); - size_t total = g * g * g; + auto& res = GpuResources::instance(); + if (!res.isInitialized() || !isDeviceAvailable()) return {}; - std::vector cnt(total, 0); - std::vector cen(total); + int num_points = (int)(points.size() / 3); + if (num_points == 0 || num_points > res.max_points) return {}; - float* d_p; - int* d_c; - float3* d_cen; + cudaStream_t st = res.stream[0]; - cudaMalloc(&d_p, points.size() * sizeof(float)); - cudaMalloc(&d_c, total * sizeof(int)); - cudaMalloc(&d_cen, total * sizeof(float3)); + std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float)); + CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float), + cudaMemcpyHostToDevice, st)); - cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice); + int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE; - int block = 1024; - int grid = (n + block - 1) / block; - voxelizeKernelOpt<<>>(d_p, n, res, range, d_c, d_cen, g); + kernelSpatialFilter<<>>(res.d_points[0], num_points, min_x, max_x, min_y, max_y, min_z, + max_z, 0, 0, 0, 0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, false, res.d_pass_mask[0]); - cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost); + CUDA_CHECK(cudaMemcpyAsync(res.h_pass_mask[0], res.d_pass_mask[0], (size_t)num_points * sizeof(uint8_t), + cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaStreamSynchronize(st)); - std::vector r; - r.reserve(total / 10); - for (size_t i = 0; i < total; i++) - if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z); - - cudaFree(d_p); - cudaFree(d_c); - cudaFree(d_cen); - return r; + std::vector results; + results.reserve(num_points / 2); + for (int i = 0; i < num_points; ++i) + if (res.h_pass_mask[0][i]) results.push_back(i); + return results; #else return {}; #endif } -// ================================================================= -// 离群点去除 -// ================================================================= -std::vector gpuStatisticalOutlierRemoval(const std::vector& points, int mean_k, float std_t) +/* ================================================================ + * 体素化接口(独立) + * ================================================================ */ + +std::vector gpuVoxelGridFilter(const std::vector& points, float resolution, float range) { - return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t); +#ifdef USE_GPU + auto& res = GpuResources::instance(); + if (!res.isInitialized() || !isDeviceAvailable()) return {}; + + int num_points = (int)(points.size() / 3); + if (num_points == 0 || num_points > res.max_points) return {}; + + int grid_dim = static_cast(range / resolution); + if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim; + int num_voxels = grid_dim * grid_dim * grid_dim; + if (num_voxels > res.max_voxels) return {}; + + cudaStream_t st = res.stream[0]; + + /* 全部点视为通过空间过滤,pass_mask 全置 1 */ + CUDA_CHECK(cudaMemsetAsync(res.d_pass_mask[0], 1, (size_t)num_points * sizeof(uint8_t), st)); + + std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float)); + CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float), + cudaMemcpyHostToDevice, st)); + + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[0], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[0], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[0], 0, (size_t)num_voxels * sizeof(int), st)); + CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[0], 0, (size_t)num_voxels * sizeof(int), st)); + + int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernelVoxelize<<>>(res.d_points[0], res.d_pass_mask[0], num_points, resolution, + range, grid_dim, res.d_voxel_counts[0], res.d_voxel_sum_x[0], + res.d_voxel_sum_y[0], res.d_voxel_sum_z[0]); + + /* 回传体素结果 */ + CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_counts[0], res.d_voxel_counts[0], (size_t)num_voxels * sizeof(int), + cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_x[0], res.d_voxel_sum_x[0], (size_t)num_voxels * sizeof(int), + cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_y[0], res.d_voxel_sum_y[0], (size_t)num_voxels * sizeof(int), + cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_z[0], res.d_voxel_sum_z[0], (size_t)num_voxels * sizeof(int), + cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaStreamSynchronize(st)); + + float scale_inv = 1.0f / (float)VOXEL_SCALE; + std::vector result; + result.reserve(num_voxels / 10); + for (int i = 0; i < num_voxels; ++i) + { + int cnt = res.h_voxel_counts[0][i]; + if (cnt == 0) continue; + float inv_cnt = 1.0f / (float)cnt; + result.emplace_back((float)res.h_voxel_sum_x[0][i] * scale_inv * inv_cnt, + (float)res.h_voxel_sum_y[0][i] * scale_inv * inv_cnt, + (float)res.h_voxel_sum_z[0][i] * scale_inv * inv_cnt); + } + return result; +#else + return {}; +#endif } -// ================================================================= -// PCL 格式转换 -// ================================================================= +/* ================================================================ + * SOR 接口(独立) + * ================================================================ */ + +std::vector gpuStatisticalOutlierRemoval(const std::vector& points, int mean_k, float std_thresh) +{ +#ifdef USE_GPU + auto& res = GpuResources::instance(); + if (!res.isInitialized() || !isDeviceAvailable()) return {}; + + int num_points = (int)(points.size() / 3); + if (num_points == 0 || num_points > res.max_points) return {}; + + cudaStream_t st = res.stream[0]; + + std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float)); + CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float), + cudaMemcpyHostToDevice, st)); + + float* d_mkdist = res.d_mean_dist[0]; + int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernelComputeMeanKDist<<>>(res.d_points[0], num_points, mean_k, d_mkdist); + + float* h_mkdist = reinterpret_cast(res.h_voxel_sum_x[0]); + CUDA_CHECK(cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)num_points * sizeof(float), cudaMemcpyDeviceToHost, st)); + CUDA_CHECK(cudaStreamSynchronize(st)); + + return cpuSORFilter(h_mkdist, num_points, std_thresh); +#else + return {}; +#endif +} + +/* ================================================================ + * PCL ↔ float 数组转换辅助函数 + * ================================================================ */ + std::vector convertToFloatArray(const typename pcl::PointCloud::Ptr& cloud) { - std::vector r; - if (!cloud) return r; - r.reserve(cloud->size() * 3); - for (auto& p : cloud->points) + std::vector result; + if (!cloud || cloud->empty()) return result; + + result.resize(cloud->size() * 3); + for (size_t i = 0; i < cloud->size(); ++i) { - r.push_back(p.x); - r.push_back(p.y); - r.push_back(p.z); + result[i * 3 + 0] = cloud->points[i].x; + result[i * 3 + 1] = cloud->points[i].y; + result[i * 3 + 2] = cloud->points[i].z; } - return r; + return result; } typename pcl::PointCloud::Ptr convertToPCLCloud(const std::vector& points) { - auto cloud = typename pcl::PointCloud::Ptr(new pcl::PointCloud); + auto cloud = std::make_shared>(); size_t n = points.size() / 3; cloud->resize(n); - for (size_t i = 0; i < n; i++) + for (size_t i = 0; i < n; ++i) { cloud->points[i].x = points[i * 3 + 0]; cloud->points[i].y = points[i * 3 + 1]; cloud->points[i].z = points[i * 3 + 2]; } - cloud->width = n; + cloud->width = (uint32_t)n; cloud->height = 1; cloud->is_dense = true; return cloud; -} \ No newline at end of file +} diff --git a/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp b/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp index 997bae0..a8054d6 100644 --- a/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp +++ b/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp @@ -1,4 +1,32 @@ +/* merge_two_lidars.cpp - 多线程重构版 + * + * ═══════════════════════════════════════════════════════ + * 根本问题诊断: + * + * 原版所有工作都在 rclcpp::spin() 的单线程中串行执行: + * 回调到达 → 坐标变换 → 过滤/体素/SOR → 缓存入队 → 匹配 → 合并 → 发布 → 栅格化 → 打印 + * 每帧耗时叠加,下一帧的回调被阻塞,造成队列积压 → ~1s 延迟 + 丢帧。 + * CPU/GPU 占用率低的原因:大量时间花在串行等待上,而非计算。 + * + * 重构架构(三线程流水线): + * + * ROS spin 线程(仅做接收 + 轻量入队,<1ms) + * ↓ 原始消息队列(无锁 + 条件变量) + * 处理线程(坐标变换 + 过滤 + 体素 + SOR,两路并行) + * ↓ 已处理帧队列 + * 发布线程(匹配 + 合并 + 栅格 + 发布,无繁重计算) + * + * 关键改动: + * 1. MultiThreadedExecutor + 回调组,让前/后雷达回调并发执行 + * 2. 处理线程池(前后各一线程)并行做 CPU/GPU 处理 + * 3. tryMerge 中去掉 adaptive_threshold 等待逻辑(这是延迟的主要来源) + * 4. visualizeGridInTerminal 中去掉 std::system("clear")(每帧 fork 进程) + * 5. 缓存改用 received_time 排序,匹配逻辑简化 + * ═══════════════════════════════════════════════════════ + */ + #include +#include #include #include #include @@ -8,1015 +36,1126 @@ #include #include -// GPU处理支持 #ifdef USE_GPU #include "gpu_processing.h" #endif +#include #include +#include +#include #include #include #include #include +#include +#include +#include +#include +#ifdef __linux__ +#include +#include + +#include /* strerror */ +#endif + #include #include #include #include -#include using sensor_msgs::msg::PointCloud2; -using std::placeholders::_1; -// 点云帧结构体(新增processed_cloud存储处理后的点云) -struct CloudFrame +/* ================================================================ + * 数据结构 + * ================================================================ */ + +/* 原始消息(刚从 ROS 回调拿到,尚未处理) */ +struct RawFrame { - std::shared_ptr cloud; // 原始点云 - std::shared_ptr processed_cloud; // 处理后的点云 - pcl::PointCloud::Ptr processed_pcl; // 处理后的PCL点云(用于栅格化) - rclcpp::Time stamp; - rclcpp::Time received_time; // 添加接收时间用于延时分析 - std::string source; + PointCloud2::SharedPtr msg; + rclcpp::Time received_time; + int channel; /* 0=前, 1=后 */ }; -// 内存池实现 +/* 处理完成的帧(过滤 + 体素 + SOR 后) */ +struct ProcessedFrame +{ + std::shared_ptr ros_cloud; /* ROS 格式,用于发布 */ + pcl::PointCloud::Ptr pcl_cloud; /* PCL 格式,用于栅格化 */ + rclcpp::Time stamp; /* 原始硬件时间戳 */ + rclcpp::Time received_time; + int channel; +}; + +/* ================================================================ + * 轻量有界队列(SPSC / MPSC 场景,用 mutex + condvar 实现) + * ================================================================ */ +template +class BoundedQueue +{ + public: + explicit BoundedQueue(size_t max_size) : max_size_(max_size) {} + + /* 生产者:满则丢弃最旧的(保持低延迟,不阻塞回调线程) */ + void push_drop_oldest(T item) + { + std::lock_guard lk(mtx_); + if (q_.size() >= max_size_) + { + q_.pop_front(); /* 丢弃最旧的 */ + } + q_.push_back(std::move(item)); + cv_.notify_one(); + } + + /* 消费者:阻塞等待直到有数据或 shutdown */ + bool pop(T& item, std::atomic& shutdown) + { + std::unique_lock lk(mtx_); + cv_.wait(lk, [&] { return !q_.empty() || shutdown.load(); }); + if (shutdown.load() && q_.empty()) return false; + item = std::move(q_.front()); + q_.pop_front(); + return true; + } + + size_t size() const + { + std::lock_guard lk(mtx_); + return q_.size(); + } + + void notify_all() { cv_.notify_all(); } + + private: + size_t max_size_; + std::deque q_; + mutable std::mutex mtx_; + std::condition_variable cv_; +}; + +/* ================================================================ + * 内存池 + * ================================================================ */ class PointCloud2Pool { public: std::shared_ptr acquire() { - std::lock_guard lock(mutex_); - if (pool_.empty()) - { - return std::make_shared(); - } - else - { - auto cloud = std::move(pool_.back()); - pool_.pop_back(); - return cloud; - } + std::lock_guard lk(mtx_); + if (pool_.empty()) return std::make_shared(); + auto p = std::move(pool_.back()); + pool_.pop_back(); + return p; } - void release(std::shared_ptr cloud) + void release(std::shared_ptr p) { - if (!cloud) return; - - cloud->data.clear(); - cloud->width = 0; - cloud->height = 0; - cloud->row_step = 0; - - std::lock_guard lock(mutex_); - if (pool_.size() < 10) // 限制池大小 - { - pool_.push_back(cloud); - } + if (!p) return; + p->data.clear(); + p->width = p->height = p->row_step = 0; + /* 保留 fields / point_step / is_bigendian,避免每次重新填充 */ + std::lock_guard lk(mtx_); + if (pool_.size() < 16) pool_.push_back(std::move(p)); } private: std::vector> pool_; - std::mutex mutex_; + std::mutex mtx_; }; +/* ================================================================ + * LidarMerger 节点 + * ================================================================ */ class LidarMerger : public rclcpp::Node { public: - LidarMerger() : Node("lidar_merger") + LidarMerger() : Node("lidar_merger"), shutdown_(false) { - /* ---------- 参数 ---------- */ + /* ---- 参数 ---- */ front_topic_ = declare_parameter("front_topic", "/rslidar_points/front_lidar"); rear_topic_ = declare_parameter("rear_topic", "/rslidar_points/rear_lidar"); output_topic_ = declare_parameter("output_topic", "/rslidar_points"); target_frame_ = declare_parameter("frame_id", "rslidar"); - queue_size_ = declare_parameter("queue_size", 3); // 减小队列大小 - cache_size_ = declare_parameter("cache_size", 10); // 增加缓存以应对延时 - time_tolerance_ = declare_parameter("time_tolerance", 0.1); // 放宽时间容差 - max_wait_time_ = declare_parameter("max_wait_time", 1.0); // 增加到1.0s以适应实际延时 - enable_debug_ = declare_parameter("enable_debug", false); // 默认关闭调试减少日志开销 + queue_size_ = declare_parameter("queue_size", 3); + time_tolerance_ = declare_parameter("time_tolerance", 0.05); /* 缩小到 50ms,减少等待 */ + enable_debug_ = declare_parameter("enable_debug", false); - // 点云处理参数 filter_min_x_ = declare_parameter("filter_min_x", -10.0f); filter_max_x_ = declare_parameter("filter_max_x", 10.0f); filter_min_y_ = declare_parameter("filter_min_y", -10.0f); filter_max_y_ = declare_parameter("filter_max_y", 10.0f); filter_min_z_ = declare_parameter("filter_min_z", -2.0f); filter_max_z_ = declare_parameter("filter_max_z", 2.0f); - voxel_size_ = declare_parameter("voxel_size", 0.1f); // 降采样体素大小 - stat_mean_k_ = declare_parameter("stat_mean_k", 30); // 统计离群点去除的k值 - stat_std_thresh_ = declare_parameter("stat_std_thresh", 1.0f); // 统计离群点去除的标准差阈值 - grid_size_ = declare_parameter("grid_size", 100); // 栅格大小 - grid_range_ = declare_parameter("grid_range", 20.0f); // 栅格范围 + voxel_size_ = declare_parameter("voxel_size", 0.1f); + stat_mean_k_ = declare_parameter("stat_mean_k", 30); + stat_std_thresh_ = declare_parameter("stat_std_thresh", 1.0f); + /* sor_method: + * 0 = 关闭(最快,纯体素化后直接输出) + * 1 = 密度滤波 O(N),推荐,比SOR快10~50x + * 2 = RadiusOutlierRemoval,比SOR快5~20x + * 3 = StatisticalOutlierRemoval(原始,KDTree,最慢) + */ + sor_method_ = declare_parameter("sor_method", 1); + radius_search_ = declare_parameter("radius_search", 0.5f); /* 方法1/2的搜索半径 */ + min_neighbors_ = declare_parameter("min_neighbors", 3); /* 方法1/2的最小邻居数 */ + grid_size_ = declare_parameter("grid_size", 100); + grid_range_ = declare_parameter("grid_range", 20.0f); enable_print_ = declare_parameter("enable_print", true); - use_gpu_ = declare_parameter("use_gpu", false); // 是否使用GPU加速 + use_gpu_ = declare_parameter("use_gpu", false); - // 车身过滤参数 - filter_car_ = declare_parameter("filter_car", true); // 是否启用车身过滤 - car_length_ = declare_parameter("car_length", 2.3f); // 车长(米) - car_width_ = declare_parameter("car_width", 1.32f); // 车宽(米) - car_lidar_offset_x_ = declare_parameter("car_lidar_offset_x", 0.0f); // LiDAR在x轴的安装偏移 - car_lidar_offset_y_ = declare_parameter("car_lidar_offset_y", 0.0f); // LiDAR在y轴的安装偏移 + filter_car_ = declare_parameter("filter_car", true); + car_length_ = declare_parameter("car_length", 2.3f); + car_width_ = declare_parameter("car_width", 1.32f); + car_lidar_offset_x_ = declare_parameter("car_lidar_offset_x", 0.0f); + car_lidar_offset_y_ = declare_parameter("car_lidar_offset_y", 0.0f); - // 初始化过滤器对象(避免每次重新创建) - // 配置体素网格过滤器 + /* ---- 预计算栅格参数 ---- */ + grid_resolution_ = grid_range_ / static_cast(grid_size_); + grid_inv_resolution_ = 1.0f / grid_resolution_; + + /* ---- 预配置 PCL 过滤器 ---- */ vg_.setLeafSize(voxel_size_, voxel_size_, voxel_size_); - - // 配置统计离群点去除器 sor_.setMeanK(stat_mean_k_); sor_.setStddevMulThresh(stat_std_thresh_); + initCropBoxes(); - // 配置车身过滤 - if (filter_car_) - { - // 车身区域 - Eigen::Vector4f car_min(car_lidar_offset_x_ - car_length_ / 2.0f, car_lidar_offset_y_ - car_width_ / 2.0f, - filter_min_z_, 1.0); - Eigen::Vector4f car_max(car_lidar_offset_x_ + car_length_ / 2.0f, car_lidar_offset_y_ + car_width_ / 2.0f, - filter_max_z_, 1.0); - car_crop_.setMin(car_min); - car_crop_.setMax(car_max); - car_crop_.setNegative(true); - - // 右侧主刷 - Eigen::Vector4f right_brush_min(0.85f, 0.6f, 1.2f, 1.0); - Eigen::Vector4f right_brush_max(1.25f, 0.96f, 1.65f, 1.0); - right_brush_crop_.setMin(right_brush_min); - right_brush_crop_.setMax(right_brush_max); - right_brush_crop_.setNegative(true); - - // 左侧主刷 - Eigen::Vector4f left_brush_min(0.85f, -0.96f, 1.2f, 1.0); - Eigen::Vector4f left_brush_max(1.25f, -0.6f, 1.65f, 1.0); - left_brush_crop_.setMin(left_brush_min); - left_brush_crop_.setMax(left_brush_max); - left_brush_crop_.setNegative(true); - } - - // 打印所有参数值(添加到构造函数末尾) - RCLCPP_INFO_STREAM( - this->get_logger(), - "\n\n---------- 点云融合节点参数配置 ----------" - << "\n [输入输出]" - << "\n 前雷达话题: " << front_topic_ << "\n 后雷达话题: " << rear_topic_ - << "\n 输出话题: " << output_topic_ << "\n 目标坐标系: " << target_frame_ << "\n [同步参数]" - << "\n 队列大小: " << queue_size_ << "\n 缓存大小: " << cache_size_ << "\n 时间容差(s): " - << time_tolerance_ << "\n 最大等待时间(s): " << max_wait_time_ << "\n [调试选项]" - << "\n 调试模式: " << (enable_debug_ ? "开启" : "关闭") << "\n [点云处理]" - << "\n 空间过滤范围(m): [" << filter_min_x_ << ", " << filter_max_x_ << "] x [" << filter_min_y_ - << ", " << filter_max_y_ << "] x [" << filter_min_z_ << ", " << filter_max_z_ << "]" - << "\n 体素大小(m): " << voxel_size_ << "\n 离群点k值: " << stat_mean_k_ - << "\n 离群点标准差阈值: " << stat_std_thresh_ << "\n 栅格大小: " << grid_size_ << "x" - << grid_size_ << "\n 栅格范围(m): " << grid_range_ - << "\n 打印栅格: " << (enable_print_ ? "开启" : "关闭") << "\n [车身过滤]" - << "\n 启用车身过滤: " << (filter_car_ ? "是" : "否") << "\n 车身尺寸(m): " << car_length_ - << " x " << car_width_ << "\n LiDAR偏移(m): (" << car_lidar_offset_x_ << ", " << car_lidar_offset_y_ - << ")" - << "\n--------------------------------------------\n"); - - /* ---------- TF ---------- */ + /* ---- TF ---- */ tf_buffer_ = std::make_shared(get_clock()); tf_listener_ = std::make_shared(*tf_buffer_); - /* ---------- QoS - 优化为低延时 ---------- */ - auto qos = rclcpp::QoS(rclcpp::KeepLast(queue_size_)).best_effort().durability_volatile(); - // 移除deadline约束,避免因延时导致的消息丢弃 - - /* ---------- 订阅 ---------- */ - sub_front_ = create_subscription(front_topic_, qos, - std::bind(&LidarMerger::frontCB, this, std::placeholders::_1)); - sub_rear_ = create_subscription(rear_topic_, qos, - std::bind(&LidarMerger::rearCB, this, std::placeholders::_1)); - - /* ---------- 发布 ---------- */ + /* ---- 发布者 ---- */ auto pub_qos = rclcpp::QoS(rclcpp::KeepLast(queue_size_)).reliable().durability_volatile(); pub_ = create_publisher(output_topic_, pub_qos); + grid_pub_ = create_publisher("grid_raw", 10); - // 创建栅格发布者 - grid_pub_ = this->create_publisher("grid_raw", 10); + /* ---- GPU 初始化 ---- */ + initGpuResources(); - // 统计信息 + /* ---- 启动后台工作线程 ---- */ + process_thread_front_ = std::thread(&LidarMerger::processThreadFunc, this, 0); + process_thread_rear_ = std::thread(&LidarMerger::processThreadFunc, this, 1); + publish_thread_ = std::thread(&LidarMerger::publishThreadFunc, this); + grid_thread_ = std::thread(&LidarMerger::gridThreadFunc, this); + + /* ---- CPU 亲和性绑定 ---- + * Jetson Orin: core0-3=A55小核, core4-7=A78大核 + * core5 原先跑到78%(多线程抢占),现在4个工作线程各绑一个独立大核 + * process_front -> core6, process_rear -> core7 + * publish -> core4, grid -> core5 + * ROS spin 走小核 0-3,仅入队 <0.1ms 足够 + */ + setThreadAffinity(process_thread_front_, {6}); + setThreadAffinity(process_thread_rear_, {7}); + setThreadAffinity(publish_thread_, {4}); + setThreadAffinity(grid_thread_, {5}); + + /* SCHED_FIFO 实时优先级,防止 OS 在关键路径抢占 + * 需要 root 或 CAP_SYS_NICE,失败时静默降级 */ + setThreadRealtime(process_thread_front_, 40); + setThreadRealtime(process_thread_rear_, 40); + setThreadRealtime(publish_thread_, 35); + setThreadRealtime(grid_thread_, 20); + + /* ---- 订阅(使用 MutuallyExclusive 回调组,spin 时可并发) ---- */ + auto cb_group = create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive); + auto sub_qos = rclcpp::QoS(rclcpp::KeepLast(queue_size_)).best_effort().durability_volatile(); + + rclcpp::SubscriptionOptions opts; + opts.callback_group = cb_group; + + sub_front_ = create_subscription( + front_topic_, sub_qos, [this](PointCloud2::SharedPtr msg) { onCloud(msg, 0); }, opts); + sub_rear_ = create_subscription( + rear_topic_, sub_qos, [this](PointCloud2::SharedPtr msg) { onCloud(msg, 1); }, opts); + + /* 统计 */ last_stats_time_ = now(); - front_count_ = 0; - rear_count_ = 0; - merged_count_ = 0; + front_rx_ = rear_rx_ = merged_count_ = 0; - RCLCPP_INFO(get_logger(), "LidarMerger started with cache_size=%d, time_tolerance=%.3fs, max_wait=%.3fs", - cache_size_, time_tolerance_, max_wait_time_); + RCLCPP_INFO(get_logger(), "LidarMerger ready | time_tolerance=%.3fs | use_gpu=%s", time_tolerance_, + use_gpu_ ? "true" : "false"); + } + + ~LidarMerger() + { + shutdown_.store(true); + raw_queue_front_.notify_all(); + raw_queue_rear_.notify_all(); + processed_queue_.notify_all(); + grid_queue_.notify_all(); + + if (process_thread_front_.joinable()) process_thread_front_.join(); + if (process_thread_rear_.joinable()) process_thread_rear_.join(); + if (publish_thread_.joinable()) publish_thread_.join(); + if (grid_thread_.joinable()) grid_thread_.join(); + +#ifdef USE_GPU + GpuResources::instance().destroy(); +#endif } private: - /* ---------- 回调函数 - 同步处理 ---------- */ - void frontCB(const PointCloud2::SharedPtr msg) + /* ============================================================ + * [1] ROS 回调(极轻量,仅入队,<0.1ms) + * ============================================================ */ + void onCloud(PointCloud2::SharedPtr msg, int channel) { - auto receive_time = now(); - front_count_++; + auto rt = now(); + if (channel == 0) + ++front_rx_; + else + ++rear_rx_; - if (enable_debug_) - { - auto data_delay = (receive_time - rclcpp::Time(msg->header.stamp)).seconds(); - RCLCPP_DEBUG(get_logger(), "Front cloud received, data_delay=%.3fs", data_delay); - } + RawFrame frame{msg, rt, channel}; - processCloudSync(msg, "front", receive_time); - } + if (channel == 0) + raw_queue_front_.push_drop_oldest(std::move(frame)); + else + raw_queue_rear_.push_drop_oldest(std::move(frame)); - void rearCB(const PointCloud2::SharedPtr msg) - { - auto receive_time = now(); - rear_count_++; - - if (enable_debug_) - { - auto data_delay = (receive_time - rclcpp::Time(msg->header.stamp)).seconds(); - RCLCPP_DEBUG(get_logger(), "Rear cloud received, data_delay=%.3fs", data_delay); - } - - processCloudSync(msg, "rear", receive_time); - } - - /* ---------- 同步处理点云 ---------- */ - void processCloudSync(const PointCloud2::SharedPtr msg, const std::string& source, const rclcpp::Time& receive_time) - { - // 1. 立即进行坐标变换(原始点云) - auto transformed_cloud = transformCloud(*msg); - if (!transformed_cloud) - { - RCLCPP_WARN(get_logger(), "Failed to transform %s cloud", source.c_str()); - return; - } - - // 2. 对变换后的原始点云进行处理(过滤、降采样、离群点去除) - auto processed_pcl = std::make_shared>(); - auto processed_cloud = processSinglePointCloud(transformed_cloud, processed_pcl); - if (!processed_cloud) - { - RCLCPP_WARN(get_logger(), "Failed to process %s cloud", source.c_str()); - return; - } - - // 3. 创建CloudFrame(存储原始+处理后的点云+处理后的PCL点云) - auto cloud_frame = std::make_shared(CloudFrame{ - transformed_cloud, processed_cloud, processed_pcl, rclcpp::Time(msg->header.stamp), receive_time, source}); - - // 4. 添加到缓存并立即尝试合并 - { - std::lock_guard lock(cache_mutex_); - addToCache(cloud_frame); - - // 立即尝试合并,无需等待定时器 - tryMerge(); - } - - // 定期输出统计信息 printStats(); } - /* ---------- 单雷达点云处理(独立处理前后雷达点云) ---------- */ - std::shared_ptr processSinglePointCloud(const std::shared_ptr& cloud_msg, - pcl::PointCloud::Ptr& processed_pcl_out) + /* ============================================================ + * [2] 处理线程(每路雷达独立,彼此不阻塞) + * ============================================================ */ + void processThreadFunc(int channel) { - if (!cloud_msg || cloud_msg->data.empty()) return nullptr; + auto& raw_q = (channel == 0) ? raw_queue_front_ : raw_queue_rear_; - // 将ROS点云消息转换为PCL点云 - pcl::PointCloud::Ptr cloud_pcl(new pcl::PointCloud); - pcl::fromROSMsg(*cloud_msg, *cloud_pcl); + /* 每个处理线程独立的 PCL 过滤器(避免互斥) */ + pcl::VoxelGrid vg_local; + vg_local.setLeafSize(voxel_size_, voxel_size_, voxel_size_); - // GPU加速处理 - 优化版:使用合并内核提高GPU利用率 -#ifdef USE_GPU - if (use_gpu_) + RawFrame raw; + while (raw_q.pop(raw, shutdown_)) { - RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing (merged kernel)"); + auto t0 = std::chrono::steady_clock::now(); - // 转换为简单浮点数组 - auto float_array = convertToFloatArray(cloud_pcl); + /* 坐标变换 */ + auto transformed = transformCloud(*raw.msg); + if (!transformed) continue; - // 使用合并处理内核:空间过滤+体素有效性检查+统计离群点去除 - auto valid_indices = gpuMergedProcessing(float_array, filter_min_x_, filter_min_y_, filter_min_z_, - filter_max_x_, filter_max_y_, filter_max_z_, voxel_size_, - grid_range_, stat_mean_k_, stat_std_thresh_); + /* 点云处理 */ + pcl::PointCloud::Ptr pcl_out; + auto ros_out = processSingleCloud(transformed, pcl_out, channel, vg_local); + if (!ros_out || !pcl_out || pcl_out->empty()) continue; - if (valid_indices.empty()) - { - RCLCPP_WARN(get_logger(), "[GPU] Merged processing result is empty"); - return nullptr; - } + ProcessedFrame pf; + pf.ros_cloud = ros_out; + pf.pcl_cloud = pcl_out; + pf.stamp = rclcpp::Time(raw.msg->header.stamp); + pf.received_time = raw.received_time; + pf.channel = channel; - // 提取有效点 - std::vector filtered_points; - filtered_points.reserve(valid_indices.size() * 3); - for (int idx : valid_indices) - { - filtered_points.push_back(float_array[idx * 3 + 0]); - filtered_points.push_back(float_array[idx * 3 + 1]); - filtered_points.push_back(float_array[idx * 3 + 2]); - } + processed_queue_.push_drop_oldest(std::move(pf)); - // 转换回PCL格式 - processed_pcl_out = convertToPCLCloud(filtered_points); - - // 车身和主刷过滤(使用CPU进行精细过滤,保持栅格格式一致) - if (filter_car_) - { - pcl::PointCloud::Ptr temp(new pcl::PointCloud); - car_crop_.setInputCloud(processed_pcl_out); - car_crop_.filter(*temp); - - right_brush_crop_.setInputCloud(temp); - right_brush_crop_.filter(*processed_pcl_out); - - left_brush_crop_.setInputCloud(processed_pcl_out); - left_brush_crop_.filter(*temp); - - processed_pcl_out.swap(temp); - } - - // 转换回ROS消息 - auto processed_msg = cloud_pool_.acquire(); - pcl::toROSMsg(*processed_pcl_out, *processed_msg); - processed_msg->header = cloud_msg->header; - - return processed_msg; - } -#endif - - // CPU处理(备用方案) - RCLCPP_INFO_ONCE(get_logger(), "Using CPU for point cloud processing"); - - // 1. 条件过滤(空间过滤) - pcl::PointCloud::Ptr filtered(new pcl::PointCloud); - filtered->reserve(cloud_pcl->size()); - for (const auto& point : cloud_pcl->points) - { - if (point.x > filter_min_x_ && point.x < filter_max_x_ && point.y > filter_min_y_ && - point.y < filter_max_y_ && point.z > filter_min_z_ && point.z < filter_max_z_) - { - filtered->points.push_back(point); - } - } - filtered->width = filtered->points.size(); - filtered->height = 1; - filtered->is_dense = true; - - if (filtered->empty()) - { - RCLCPP_WARN(get_logger(), "[CPU] Filtered cloud is empty!"); - return nullptr; - } - - // 2. 车身和主刷过滤(使用预配置的CropBox) - if (filter_car_) - { - pcl::PointCloud::Ptr temp(new pcl::PointCloud); - car_crop_.setInputCloud(filtered); - car_crop_.filter(*temp); - - right_brush_crop_.setInputCloud(temp); - right_brush_crop_.filter(*filtered); - - left_brush_crop_.setInputCloud(filtered); - left_brush_crop_.filter(*temp); - - filtered.swap(temp); - } - - if (filtered->empty()) - { - RCLCPP_WARN(get_logger(), "[CPU] Filtered cloud is empty after car/brush filter!"); - return nullptr; - } - - // 3. 降采样(使用预配置的VoxelGrid) - pcl::PointCloud::Ptr downsampled(new pcl::PointCloud); - vg_.setInputCloud(filtered); - vg_.filter(*downsampled); - - if (downsampled->empty()) - { - RCLCPP_WARN(get_logger(), "[processSinglePointCloud] Downsampled cloud is empty!"); - return nullptr; - } - - // 4. 离群点去除(使用预配置的StatisticalOutlierRemoval) - sor_.setInputCloud(downsampled); - sor_.filter(*processed_pcl_out); - - if (processed_pcl_out->empty()) - { - RCLCPP_WARN(get_logger(), "[processSinglePointCloud] Outlier-removed cloud is empty!"); - return nullptr; - } - - // 转换回ROS消息 - auto processed_msg = cloud_pool_.acquire(); - pcl::toROSMsg(*processed_pcl_out, *processed_msg); - processed_msg->header = cloud_msg->header; - - return processed_msg; - } - - /* ---------- 添加到缓存 ---------- */ - void addToCache(std::shared_ptr frame) - { - auto& cache = (frame->source == "front") ? front_clouds_ : rear_clouds_; - - // 保持时间排序(最新的在前) - auto it = cache.begin(); - while (it != cache.end() && (*it)->stamp > frame->stamp) - { - ++it; - } - cache.insert(it, frame); - - // 限制缓存大小 - while (cache.size() > cache_size_) - { - cloud_pool_.release(cache.back()->cloud); - cloud_pool_.release(cache.back()->processed_cloud); // 释放处理后的点云 - cache.back()->processed_pcl.reset(); // 释放PCL点云 - cache.pop_back(); - } - } - - /* ---------- 尝试合并 - 触发式(合并处理后的点云) ---------- */ - void tryMerge() - { - if (front_clouds_.empty() || rear_clouds_.empty()) return; - - // 查找最佳匹配 - auto [front_frame, rear_frame] = findBestMatchOptimized(); - - if (!front_frame || !rear_frame) return; - - // 自适应延时检查 - 基于实际网络条件调整 - auto now_time = now(); - auto front_age = (now_time - front_frame->received_time).seconds(); - auto rear_age = (now_time - rear_frame->received_time).seconds(); - - // 动态调整max_wait_time基于观察到的延时 - static double observed_max_delay = 0.0; - double current_max_delay = std::max(front_age, rear_age); - observed_max_delay = std::max(observed_max_delay * 0.95, current_max_delay); // 指数衰减 - - // 使用观察到的延时来决定是否处理,而不是固定阈值 - double adaptive_threshold = std::min(max_wait_time_, observed_max_delay + 0.1); - - if (front_age > adaptive_threshold || rear_age > adaptive_threshold) - { if (enable_debug_) { - RCLCPP_WARN_THROTTLE(get_logger(), *get_clock(), 5000, - "Data age check: front_age=%.3fs, rear_age=%.3fs, threshold=%.3fs", front_age, - rear_age, adaptive_threshold); + auto dt = std::chrono::duration_cast(std::chrono::steady_clock::now() - t0) + .count(); + RCLCPP_DEBUG(get_logger(), "[ch%d] process %.1fms, pts=%zu", channel, dt / 1000.0, pcl_out->size()); } - - // 不直接返回,而是清理最旧的数据后重试 - cleanupExpiredClouds(); - if (!front_clouds_.empty() && !rear_clouds_.empty()) - { - // 递归重试一次 - static int retry_count = 0; - if (retry_count < 2) - { - retry_count++; - tryMerge(); - retry_count--; - } - } - return; } + } - // 合并处理后的点云(核心修改:不再合并原始点云) - auto merge_start = std::chrono::high_resolution_clock::now(); - auto merged = mergeProcessedClouds(*front_frame, *rear_frame); - auto merge_end = std::chrono::high_resolution_clock::now(); + /* ============================================================ + * [3] 发布线程(匹配 + 合并 + 发布,不含栅格化) + * 栅格化已移至独立 grid_thread_,避免打印阻塞发布路径 + * ============================================================ */ + void publishThreadFunc() + { + std::deque cache_front, cache_rear; + constexpr size_t CACHE_DEPTH = 5; - if (!merged) return; - - // 设置时间戳和frame_id - auto front_time = front_frame->stamp.nanoseconds(); - auto rear_time = rear_frame->stamp.nanoseconds(); - merged->header.stamp = rclcpp::Time((front_time + rear_time) / 2, RCL_ROS_TIME); - merged->header.frame_id = target_frame_; - - // 发布合并后的点云 - pub_->publish(*merged); - merged_count_++; - - // ========================= 栅格化 ========================= - // 直接合并处理后的PCL点云,避免再次转换 - pcl::PointCloud::Ptr merged_pcl(new pcl::PointCloud); - *merged_pcl = *front_frame->processed_pcl + *rear_frame->processed_pcl; - - // 生成栅格(直接使用合并后的PCL点云) - auto grid = generateOccupancyGrid(merged_pcl); - if (!grid.empty()) + ProcessedFrame pf; + while (processed_queue_.pop(pf, shutdown_)) { - // 可视化栅格 + auto& cache = (pf.channel == 0) ? cache_front : cache_rear; + insertSorted(cache, std::move(pf), CACHE_DEPTH); + tryMatchAndPublish(cache_front, cache_rear); + } + } + + /* ============================================================ + * [4] 栅格化线程(独立于发布线程,绑定 core5) + * 从 grid_queue_ 取合并后 PCL,生成栅格 + 可视化 + 发布 + * ============================================================ */ + void gridThreadFunc() + { + pcl::PointCloud::Ptr merged_pcl; + while (grid_queue_.pop(merged_pcl, shutdown_)) + { + auto grid = generateOccupancyGrid(merged_pcl); + if (grid.empty()) continue; if (enable_print_) visualizeGridInTerminal(grid); - // 发布栅格 publishGrid(grid); } - - // 延时分析(降低频率) - if (enable_debug_ && merged_count_ % 10 == 0) // 每10次输出一次 - { - auto total_delay = (now_time - merged->header.stamp).seconds(); - auto process_time = - std::chrono::duration(std::chrono::high_resolution_clock::now() - merge_start).count(); - - RCLCPP_INFO( - get_logger(), - "Merged #%d: merged_points=%zu, total_delay=%.3fs, process_time=%.3fms, adaptive_threshold=%.3fs", - merged_count_, merged->data.size() / merged->point_step, total_delay, process_time * 1000, - adaptive_threshold); - } - - // 清理已使用的点云 - removeUsedClouds(front_frame, rear_frame); } - /* ---------- 合并处理后的点云 ---------- */ - std::shared_ptr mergeProcessedClouds(const CloudFrame& front, const CloudFrame& rear) + /* 按时间戳降序插入,并限制缓存深度 */ + static void insertSorted(std::deque& cache, ProcessedFrame pf, size_t max_depth) { - if (!front.processed_cloud || !rear.processed_cloud) - { - RCLCPP_ERROR(get_logger(), "Processed cloud is null!"); - return nullptr; - } - - size_t front_points = front.processed_cloud->data.size() / front.processed_cloud->point_step; - size_t rear_points = rear.processed_cloud->data.size() / rear.processed_cloud->point_step; - - if (front.processed_cloud->fields != rear.processed_cloud->fields || - front.processed_cloud->point_step != rear.processed_cloud->point_step || - front.processed_cloud->is_bigendian != rear.processed_cloud->is_bigendian) - { - RCLCPP_ERROR(get_logger(), "Processed PointCloud2 formats do not match!"); - return nullptr; - } - - auto merged = cloud_pool_.acquire(); - - // 复制元数据 - merged->header = front.processed_cloud->header; - merged->fields = front.processed_cloud->fields; - merged->is_bigendian = front.processed_cloud->is_bigendian; - merged->point_step = front.processed_cloud->point_step; - merged->height = 1; - merged->width = front_points + rear_points; - - // 高效合并处理后的点云数据 - merged->data.clear(); - merged->data.reserve(front.processed_cloud->data.size() + rear.processed_cloud->data.size()); - merged->data.insert(merged->data.end(), front.processed_cloud->data.begin(), front.processed_cloud->data.end()); - merged->data.insert(merged->data.end(), rear.processed_cloud->data.begin(), rear.processed_cloud->data.end()); - merged->row_step = merged->data.size(); - - return merged; + auto it = cache.begin(); + while (it != cache.end() && it->stamp > pf.stamp) ++it; + cache.insert(it, std::move(pf)); + while (cache.size() > max_depth) cache.pop_back(); } - /* ---------- 条件过滤 ---------- */ - pcl::PointCloud::Ptr applyConditionalFiltering(const pcl::PointCloud::Ptr& cloud) + /* 匹配最新时间戳对齐帧,合并并发布 */ + void tryMatchAndPublish(std::deque& cf, std::deque& cr) { - pcl::PointCloud::Ptr filtered(new pcl::PointCloud); + if (cf.empty() || cr.empty()) return; - for (const auto& point : cloud->points) - { - // 自定义过滤条件:保留特定区域内的点 - if (point.x > filter_min_x_ && point.x < filter_max_x_ && point.y > filter_min_y_ && - point.y < filter_max_y_ && point.z > filter_min_z_ && point.z < filter_max_z_) - { - filtered->points.push_back(point); - } - } - - filtered->width = filtered->points.size(); - filtered->height = 1; - filtered->is_dense = true; - - // 不启用车身过滤,直接返回 - if (!filter_car_) return filtered; - - // 使用CropBox移除车身区域 - pcl::CropBox crop; - crop.setInputCloud(filtered); - - // 设置车身区域(最小点和最大点) - Eigen::Vector4f min_point(car_lidar_offset_x_ - car_length_ / 2.0f, car_lidar_offset_y_ - car_width_ / 2.0f, - filter_min_z_, 1.0); - - Eigen::Vector4f max_point(car_lidar_offset_x_ + car_length_ / 2.0f, car_lidar_offset_y_ + car_width_ / 2.0f, - filter_max_z_, 1.0); - - crop.setMin(min_point); - crop.setMax(max_point); - crop.setNegative(true); // true表示保留外部点,false表示保留内部点 - - pcl::PointCloud::Ptr output(new pcl::PointCloud); - crop.filter(*output); - - // 右侧主刷 (x:0.85~1.25, y:0.6~0.96, z:1.2~1.65) - pcl::CropBox crop1; - crop1.setInputCloud(output); - Eigen::Vector4f min1(0.85f, 0.6f, 1.2f, 1.0); // 右侧主刷最小点 - Eigen::Vector4f max1(1.25f, 0.96f, 1.65f, 1.0); // 右侧主刷最大点 - crop1.setMin(min1); - crop1.setMax(max1); - crop1.setNegative(true); // 剔除右侧主刷内的点 - - pcl::PointCloud::Ptr output1(new pcl::PointCloud); - crop1.filter(*output1); - - // 左侧主刷 - 右侧主刷的X轴对称(y坐标取相反数) - pcl::CropBox crop2; - crop2.setInputCloud(output1); - // X轴对称:将y坐标取相反数,保持x和z坐标不变 - Eigen::Vector4f min2(0.85f, -0.96f, 1.2f, 1.0); // 左侧主刷最小点(y取反后注意交换大小) - Eigen::Vector4f max2(1.25f, -0.6f, 1.65f, 1.0); // 左侧主刷最大点(y取反后注意交换大小) - crop2.setMin(min2); - crop2.setMax(max2); - crop2.setNegative(true); // 剔除左侧主刷内的点 - - pcl::PointCloud::Ptr final_output(new pcl::PointCloud); - crop2.filter(*final_output); - - return final_output; // 剔除两侧主刷 - } - - /* ---------- 降采样(体素网格过滤) ---------- */ - pcl::PointCloud::Ptr applyVoxelGridFiltering(const pcl::PointCloud::Ptr& cloud) - { - pcl::PointCloud::Ptr downsampled(new pcl::PointCloud); - pcl::VoxelGrid vg; - - vg.setInputCloud(cloud); - vg.setLeafSize(voxel_size_, voxel_size_, voxel_size_); - vg.filter(*downsampled); - - return downsampled; - } - - /* ---------- 离群点去除 ---------- */ - pcl::PointCloud::Ptr applyStatisticalOutlierRemoval(const pcl::PointCloud::Ptr& cloud) - { - pcl::PointCloud::Ptr filtered(new pcl::PointCloud); - pcl::StatisticalOutlierRemoval sor; - - sor.setInputCloud(cloud); - sor.setMeanK(stat_mean_k_); // 考虑每个点的k个最近邻 - sor.setStddevMulThresh(stat_std_thresh_); // 标准差倍数阈值 - sor.filter(*filtered); - - return filtered; - } - - /* ---------- 栅格化处理 ---------- */ - std::vector> generateOccupancyGrid(const pcl::PointCloud::Ptr& cloud) - { - // 0:空 1:障碍物 2:车体 - std::vector> grid(grid_size_, std::vector(grid_size_, 0)); - static float resolution = grid_range_ / static_cast(grid_size_); // 静态变量避免重复计算 - static float inv_resolution = 1.0f / resolution; - - // 标记车体区域 - if (filter_car_) - { - // 计算车体在栅格中的位置 - int car_min_x = - static_cast((grid_range_ / 2 - (car_lidar_offset_x_ + car_length_ / 2.0f)) * inv_resolution); - int car_max_x = - static_cast((grid_range_ / 2 - (car_lidar_offset_x_ - car_length_ / 2.0f)) * inv_resolution); - int car_min_y = - static_cast((-car_lidar_offset_y_ - car_width_ / 2.0f + grid_range_ / 2) * inv_resolution); - int car_max_y = - static_cast((-car_lidar_offset_y_ + car_width_ / 2.0f + grid_range_ / 2) * inv_resolution); - - // 确保索引在有效范围内 - car_min_x = std::max(0, std::min(car_min_x, grid_size_ - 1)); - car_max_x = std::max(0, std::min(car_max_x, grid_size_ - 1)); - car_min_y = std::max(0, std::min(car_min_y, grid_size_ - 1)); - car_max_y = std::max(0, std::min(car_max_y, grid_size_ - 1)); - - // 在栅格中标记车体区域 - for (int x = car_min_x; x <= car_max_x; ++x) - { - for (int y = car_min_y; y <= car_max_y; ++y) - { - grid[x][y] = 2; - } - } - } - - // 标记点云障碍物 - for (const auto& point : cloud->points) - { - // X轴(前向)映射到行索引(从上到下为正X到负X) - int grid_x = static_cast((grid_range_ / 2 - point.x) * inv_resolution); - - // Y轴(右向)映射到列索引(从左到右为负Y到正Y) - int grid_y = static_cast((point.y + grid_range_ / 2) * inv_resolution); - - if (grid_x >= 0 && grid_x < grid_size_ && grid_y >= 0 && grid_y < grid_size_) - { - // 只有当该位置不是车体时才标记为障碍物 - if (grid[grid_x][grid_y] != 2) - { - grid[grid_x][grid_y] = 1; - } - } - } - - return grid; - } - - /* ---------- 可视化栅格 - 终端打印 ---------- */ - void visualizeGridInTerminal(const std::vector>& grid) - { - std::system("clear"); - - std::cout << " -----------------------------障碍物矩阵------------------------------- " << std::endl; - - // 打印顶部边框 - std::cout << " " << std::string(grid[0].size(), '-') << std::endl; - - // 打印列号(仅0-9) - std::cout << " "; - for (size_t i = 0; i < grid[0].size(); i++) - { - std::cout << (i % 10); - } - std::cout << std::endl; - - // 打印栅格内容(行号 + 栅格) - for (size_t i = 0; i < grid.size(); i++) - { - // 行号显示(两位数,右对齐) - std::cout << (i < 10 ? " " : "") << i << "|"; - - // 栅格内容 - for (int cell : grid[i]) - { - // 根据栅格值选择显示符号 - std::cout << (cell == 1 ? "@" : (cell == 2 ? " " : ".")); - } - - std::cout << "|" << std::endl; - } - - // 打印底部边框 - std::cout << " " << std::string(grid[0].size(), '-') << std::endl; - } - - /* ---------- 发布栅格 ---------- */ - void publishGrid(const std::vector>& grid) - { - // 创建Int32MultiArray消息 - auto msg = std_msgs::msg::Int32MultiArray(); - - // 设置维度信息 - msg.layout.dim.resize(2); - msg.layout.dim[0].label = "height"; // 行(对应X轴) - msg.layout.dim[0].size = grid.size(); - msg.layout.dim[0].stride = grid.size() * grid[0].size(); - - msg.layout.dim[1].label = "width"; // 列(对应Y轴) - msg.layout.dim[1].size = grid[0].size(); - msg.layout.dim[1].stride = grid[0].size(); - - // 设置数据(按行优先展开) - msg.data.clear(); - for (const auto& row : grid) - { - msg.data.insert(msg.data.end(), row.begin(), row.end()); - } - - // 发布消息 - grid_pub_->publish(msg); - } - - /* ---------- 优化的匹配算法 ---------- */ - std::pair, std::shared_ptr> findBestMatchOptimized() - { - if (front_clouds_.empty() || rear_clouds_.empty()) return {nullptr, nullptr}; - - // 快速策略:优先使用最新的点云进行匹配 - auto front_candidate = front_clouds_.front(); - auto rear_candidate = rear_clouds_.front(); - - // 检查时间差 - auto time_diff = std::abs((front_candidate->stamp - rear_candidate->stamp).seconds()); - - if (time_diff <= time_tolerance_) - { - return {front_candidate, rear_candidate}; - } - - // 如果最新的不匹配,寻找更好的组合(但限制搜索范围) - std::shared_ptr best_front = nullptr; - std::shared_ptr best_rear = nullptr; + /* 在前 3 帧内寻找时间差最小的配对 */ + constexpr size_t SEARCH = 3; + size_t fi_best = 0, ri_best = 0; double min_diff = std::numeric_limits::max(); - // 限制搜索范围到前3个元素 - size_t search_limit = std::min(3, std::min(front_clouds_.size(), rear_clouds_.size())); - - for (size_t i = 0; i < search_limit && i < front_clouds_.size(); ++i) + for (size_t fi = 0; fi < std::min(SEARCH, cf.size()); ++fi) { - for (size_t j = 0; j < search_limit && j < rear_clouds_.size(); ++j) + for (size_t ri = 0; ri < std::min(SEARCH, cr.size()); ++ri) { - auto diff = std::abs((front_clouds_[i]->stamp - rear_clouds_[j]->stamp).seconds()); + double diff = std::abs((cf[fi].stamp - cr[ri].stamp).seconds()); if (diff < min_diff) { min_diff = diff; - best_front = front_clouds_[i]; - best_rear = rear_clouds_[j]; + fi_best = fi; + ri_best = ri; } } } - if (min_diff <= time_tolerance_) + /* 时间差超出容差,不合并(等待另一路的下一帧) + * 注意:这里不做 "age 等待",直接用最新可用帧 */ + if (min_diff > time_tolerance_) { - return {best_front, best_rear}; - } - - return {nullptr, nullptr}; - } - - /* ---------- 移除已使用的点云 ---------- */ - void removeUsedClouds(std::shared_ptr used_front, std::shared_ptr used_rear) - { - // 从front_clouds_中移除 - auto front_it = std::find(front_clouds_.begin(), front_clouds_.end(), used_front); - if (front_it != front_clouds_.end()) - { - cloud_pool_.release((*front_it)->cloud); - cloud_pool_.release((*front_it)->processed_cloud); - (*front_it)->processed_pcl.reset(); - front_clouds_.erase(front_it); - } - - // 从rear_clouds_中移除 - auto rear_it = std::find(rear_clouds_.begin(), rear_clouds_.end(), used_rear); - if (rear_it != rear_clouds_.end()) - { - cloud_pool_.release((*rear_it)->cloud); - cloud_pool_.release((*rear_it)->processed_cloud); - (*rear_it)->processed_pcl.reset(); - rear_clouds_.erase(rear_it); - } - - // 清理过期的点云 - cleanupExpiredClouds(); - } - - /* ---------- 清理过期点云 ---------- */ - void cleanupExpiredClouds() - { - auto now_time = now(); - auto cleanup_threshold = max_wait_time_; // 使用配置的等待时间 - - // 清理front_clouds_中过期的数据 - while (!front_clouds_.empty()) - { - auto age = (now_time - front_clouds_.back()->received_time).seconds(); - if (age > cleanup_threshold) + /* 如果两路都只有一帧且时间差不大于 3 倍容差,强制合并 + * 避免两路雷达频率略有差异时永远找不到完美匹配 */ + if (cf.size() == 1 && cr.size() == 1 && min_diff <= time_tolerance_ * 3.0) { - if (enable_debug_) - { - RCLCPP_DEBUG(get_logger(), "Cleaning up old front cloud, age=%.3fs", age); - } - cloud_pool_.release(front_clouds_.back()->cloud); - cloud_pool_.release(front_clouds_.back()->processed_cloud); - front_clouds_.back()->processed_pcl.reset(); - front_clouds_.pop_back(); + /* 允许宽松合并,继续向下 */ } else { - break; + /* 丢弃最旧的帧后返回,等下一帧触发 */ + if (cf.back().stamp < cr.back().stamp) + cf.pop_back(); + else + cr.pop_back(); + return; } } - // 清理rear_clouds_中过期的数据 - while (!rear_clouds_.empty()) + ProcessedFrame& front_frame = cf[fi_best]; + ProcessedFrame& rear_frame = cr[ri_best]; + + auto t_merge = std::chrono::steady_clock::now(); + + /* 合并 ROS 点云(内存拷贝,μs 级) */ + auto merged_ros = mergeRosClouds(front_frame, rear_frame); + if (!merged_ros) goto cleanup; + + /* 发布 */ + pub_->publish(*merged_ros); + pool_.release(merged_ros); + ++merged_count_; + + /* 栅格化:push 到 grid_queue_,由独立线程异步处理,不阻塞发布 */ { - auto age = (now_time - rear_clouds_.back()->received_time).seconds(); - if (age > cleanup_threshold) - { - if (enable_debug_) - { - RCLCPP_DEBUG(get_logger(), "Cleaning up old rear cloud, age=%.3fs", age); - } - cloud_pool_.release(rear_clouds_.back()->cloud); - cloud_pool_.release(rear_clouds_.back()->processed_cloud); - rear_clouds_.back()->processed_pcl.reset(); - rear_clouds_.pop_back(); - } - else - { - break; - } + auto merged_pcl = std::make_shared>(); + *merged_pcl = *front_frame.pcl_cloud + *rear_frame.pcl_cloud; + grid_queue_.push_drop_oldest(std::move(merged_pcl)); } + + if (enable_debug_) + { + auto dt = std::chrono::duration_cast(std::chrono::steady_clock::now() - t_merge) + .count(); + auto e2e = (now() - front_frame.received_time).seconds(); + RCLCPP_INFO(get_logger(), "[Merge#%d] timediff=%.3fs, merge=%.1fms, e2e_delay=%.3fs", merged_count_.load(), + min_diff, dt / 1000.0, e2e); + } + + cleanup: + /* 从缓存中移除已使用的帧 */ + cf.erase(cf.begin() + fi_best); + cr.erase(cr.begin() + ri_best); } - /* ---------- 坐标变换 - 简化版 ---------- */ + /* ============================================================ + * 坐标变换(线程安全:tf2::Buffer 内部加锁) + * ============================================================ */ std::shared_ptr transformCloud(const PointCloud2& in) { if (in.header.frame_id == target_frame_) { - auto out = cloud_pool_.acquire(); + auto out = pool_.acquire(); *out = in; return out; } - try { - // 使用最新可用的变换,不等待 auto tf = tf_buffer_->lookupTransform(target_frame_, in.header.frame_id, tf2::TimePointZero); - - auto out = cloud_pool_.acquire(); + auto out = pool_.acquire(); tf2::doTransform(in, *out, tf); return out; } catch (const tf2::TransformException& e) { - RCLCPP_WARN_THROTTLE(get_logger(), *get_clock(), 1000, "TF failed (%s -> %s): %s", - in.header.frame_id.c_str(), target_frame_.c_str(), e.what()); + RCLCPP_WARN_THROTTLE(get_logger(), *get_clock(), 1000, "TF failed: %s", e.what()); return nullptr; } } - /* ---------- 统计信息 ---------- */ - void printStats() + /* ============================================================ + * 单雷达点云处理(在处理线程中调用,使用线程本地过滤器) + * + * 优化点: + * 1. 零拷贝读取 PointCloud2::data(绕过 pcl::fromROSMsg 的整体拷贝) + * 直接从 ROS2 raw bytes 提取 x/y/z 写入锁页内存 + * 2. GPU 结果直接写 PointCloud2::data(绕过 convertToPCLCloud + toROSMsg) + * pcl_out 从 PointCloud2::data 反向填充,避免再次拷贝 + * 3. 双缓冲异步:向 GPU 提交本帧后立刻返回, + * 下次调用时才同步上一帧结果(CPU 准备和 GPU 计算重叠) + * ============================================================ */ + std::shared_ptr processSingleCloud(const std::shared_ptr& cloud_msg, + pcl::PointCloud::Ptr& pcl_out, int channel, + pcl::VoxelGrid& vg_local) { - auto current_time = now(); - if ((current_time - last_stats_time_).seconds() >= 5.0) // 每5秒输出一次 - { - RCLCPP_INFO(get_logger(), "Stats: Front=%d, Rear=%d, Merged=%d, Cache(F:%zu,R:%zu)", front_count_, - rear_count_, merged_count_, front_clouds_.size(), rear_clouds_.size()); + if (!cloud_msg || cloud_msg->data.empty()) return nullptr; - last_stats_time_ = current_time; +/* ======== GPU 路径 ======== */ +#ifdef USE_GPU + if (use_gpu_ && GpuResources::instance().isInitialized()) + { + auto& res = GpuResources::instance(); + + /* ── 优化1:零拷贝读取 PointCloud2::data ── + * PointCloud2 的 data 是 packed 字节流,每个点占 point_step 字节。 + * x/y/z 通常在偏移 0/4/8(float32),直接读取,跳过 PCL 转换。 + * 如果 field 偏移不是标准布局则退回原始方式。 + */ + const uint32_t pt_step = cloud_msg->point_step; + const uint8_t* raw = cloud_msg->data.data(); + int n = (int)(cloud_msg->data.size() / pt_step); + + if (n <= 0 || n > res.max_points) goto cpu_path; + + /* 找 x/y/z 字段偏移(通常是 0/4/8,但做一次安全检查) */ + uint32_t off_x = 0, off_y = 4, off_z = 8; + bool std_layout = false; + { + uint32_t found = 0; + for (const auto& f : cloud_msg->fields) + { + if (f.name == "x" && f.datatype == 7 /*FLOAT32*/) + { + off_x = f.offset; + found++; + } + if (f.name == "y" && f.datatype == 7) + { + off_y = f.offset; + found++; + } + if (f.name == "z" && f.datatype == 7) + { + off_z = f.offset; + found++; + } + } + std_layout = (found == 3); + } + if (!std_layout) goto cpu_path; + + /* 逐点提取 x/y/z → 锁页内存(比 fromROSMsg 少一次整体 memcpy)*/ + { + float* h = res.h_points[channel]; + for (int i = 0; i < n; ++i) + { + const uint8_t* p = raw + (size_t)i * pt_step; + float fx, fy, fz; + std::memcpy(&fx, p + off_x, 4); + std::memcpy(&fy, p + off_y, 4); + std::memcpy(&fz, p + off_z, 4); + h[i * 3 + 0] = fx; + h[i * 3 + 1] = fy; + h[i * 3 + 2] = fz; + } + } + + /* GPU 处理(内部异步提交,cudaStreamSynchronize 在函数末尾) */ + GpuProcessingParams params = buildGpuParams(); + auto result = gpuProcessPointCloud(res.h_points[channel], n, params, channel); + if (result.empty()) goto cpu_path; + + /* ── 优化2:直接从 float[] 构建 PointCloud2,跳过 PCL 中间格式 ── + * 同时构建 pcl_out(供栅格化使用),共享同一块数据 */ + { + int out_n = (int)(result.size() / 3); + + /* 构建 PointCloud2(标准 XYZ float32 layout)*/ + auto ros_out = pool_.acquire(); + ros_out->header = cloud_msg->header; + ros_out->height = 1; + ros_out->width = (uint32_t)out_n; + ros_out->is_dense = true; + ros_out->is_bigendian = false; + ros_out->point_step = 12; /* 3 × float32 */ + ros_out->row_step = (uint32_t)(out_n * 12); + + /* 只设置一次 fields(第一次或 fields 为空时)*/ + if (ros_out->fields.empty()) + { + ros_out->fields.resize(3); + for (int fi = 0; fi < 3; ++fi) + { + ros_out->fields[fi].datatype = 7; /* FLOAT32 */ + ros_out->fields[fi].count = 1; + ros_out->fields[fi].offset = (uint32_t)(fi * 4); + } + ros_out->fields[0].name = "x"; + ros_out->fields[1].name = "y"; + ros_out->fields[2].name = "z"; + } + + ros_out->data.resize((size_t)out_n * 12); + std::memcpy(ros_out->data.data(), result.data(), (size_t)out_n * 12); + + /* pcl_out:直接从 result 填充,避免再次 memcpy */ + pcl_out = std::make_shared>(); + pcl_out->resize(out_n); + pcl_out->width = (uint32_t)out_n; + pcl_out->height = 1; + pcl_out->is_dense = true; + for (int i = 0; i < out_n; ++i) + { + pcl_out->points[i].x = result[i * 3 + 0]; + pcl_out->points[i].y = result[i * 3 + 1]; + pcl_out->points[i].z = result[i * 3 + 2]; + } + + return ros_out; + } + } +#endif + + /* ======== CPU 路径 ======== */ + cpu_path: + /* 零拷贝解包:从 PointCloud2::data 直接读点,不走 fromROSMsg */ + const uint32_t pt_step_cpu = cloud_msg->point_step; + const uint8_t* raw_cpu = cloud_msg->data.data(); + int n_cpu = (int)(cloud_msg->data.size() / pt_step_cpu); + + /* 找字段偏移 */ + uint32_t ox = 0, oy = 4, oz = 8; + { + for (const auto& f : cloud_msg->fields) + { + if (f.name == "x") ox = f.offset; + if (f.name == "y") oy = f.offset; + if (f.name == "z") oz = f.offset; + } + } + + /* 预计算车身/扫刷边界 */ + const float cmin_x = car_lidar_offset_x_ - car_length_ * 0.5f; + const float cmax_x = car_lidar_offset_x_ + car_length_ * 0.5f; + const float cmin_y = car_lidar_offset_y_ - car_width_ * 0.5f; + const float cmax_y = car_lidar_offset_y_ + car_width_ * 0.5f; + + /* 1. Bounding box + 车身/扫刷过滤(单次遍历,直接从 raw bytes 读取) */ + pcl::PointCloud::Ptr filtered(new pcl::PointCloud); + filtered->reserve((size_t)n_cpu); + + for (int i = 0; i < n_cpu; ++i) + { + const uint8_t* p = raw_cpu + (size_t)i * pt_step_cpu; + float fx, fy, fz; + std::memcpy(&fx, p + ox, 4); + std::memcpy(&fy, p + oy, 4); + std::memcpy(&fz, p + oz, 4); + + if (fx <= filter_min_x_ || fx >= filter_max_x_ || fy <= filter_min_y_ || fy >= filter_max_y_ || + fz <= filter_min_z_ || fz >= filter_max_z_) + continue; + + if (filter_car_) + { + if (fx >= cmin_x && fx <= cmax_x && fy >= cmin_y && fy <= cmax_y) continue; + + if (fx >= 0.85f && fx <= 1.25f && fy >= 0.60f && fy <= 0.96f && fz >= 1.20f && fz <= 1.65f) continue; + + if (fx >= 0.85f && fx <= 1.25f && fy >= -0.96f && fy <= -0.60f && fz >= 1.20f && fz <= 1.65f) continue; + } + + pcl::PointXYZ pt; + pt.x = fx; + pt.y = fy; + pt.z = fz; + filtered->points.push_back(pt); + } + filtered->width = (uint32_t)filtered->points.size(); + filtered->height = 1; + filtered->is_dense = true; + if (filtered->empty()) return nullptr; + + /* 2. 体素化 */ + pcl::PointCloud::Ptr downsampled(new pcl::PointCloud); + vg_local.setInputCloud(filtered); + vg_local.filter(*downsampled); + if (downsampled->empty()) return nullptr; + + /* 3. 离群点去除(根据 sor_method_ 选择算法) */ + pcl_out = applyOutlierRemoval(downsampled); + if (!pcl_out || pcl_out->empty()) return nullptr; + + /* 4. 直接构建 PointCloud2(不走 toROSMsg,减少一次拷贝) */ + { + int out_n = (int)pcl_out->size(); + auto ros_out = pool_.acquire(); + ros_out->header = cloud_msg->header; + ros_out->height = 1; + ros_out->width = (uint32_t)out_n; + ros_out->is_dense = true; + ros_out->is_bigendian = false; + ros_out->point_step = 12; + ros_out->row_step = (uint32_t)(out_n * 12); + if (ros_out->fields.empty()) + { + ros_out->fields.resize(3); + for (int fi = 0; fi < 3; ++fi) + { + ros_out->fields[fi].datatype = 7; + ros_out->fields[fi].count = 1; + ros_out->fields[fi].offset = (uint32_t)(fi * 4); + } + ros_out->fields[0].name = "x"; + ros_out->fields[1].name = "y"; + ros_out->fields[2].name = "z"; + } + ros_out->data.resize((size_t)out_n * 12); + float* dst = reinterpret_cast(ros_out->data.data()); + for (int i = 0; i < out_n; ++i) + { + dst[i * 3 + 0] = pcl_out->points[i].x; + dst[i * 3 + 1] = pcl_out->points[i].y; + dst[i * 3 + 2] = pcl_out->points[i].z; + } + return ros_out; } } - /* ---------- 成员变量 ---------- */ + /* ============================================================ + * 合并两路 ROS 点云(memcpy,<1ms) + * ============================================================ */ + std::shared_ptr mergeRosClouds(const ProcessedFrame& front, const ProcessedFrame& rear) + { + auto& fc = front.ros_cloud; + auto& rc = rear.ros_cloud; + if (!fc || !rc) return nullptr; + + if (fc->fields != rc->fields || fc->point_step != rc->point_step || fc->is_bigendian != rc->is_bigendian) + { + RCLCPP_ERROR(get_logger(), "Cloud format mismatch!"); + return nullptr; + } + + auto merged = pool_.acquire(); + merged->header = fc->header; + merged->header.stamp = rclcpp::Time((front.stamp.nanoseconds() + rear.stamp.nanoseconds()) / 2, RCL_ROS_TIME); + merged->header.frame_id = target_frame_; + merged->fields = fc->fields; + merged->is_bigendian = fc->is_bigendian; + merged->point_step = fc->point_step; + merged->height = 1; + merged->width = (uint32_t)(fc->data.size() / fc->point_step + rc->data.size() / rc->point_step); + + merged->data.clear(); + merged->data.reserve(fc->data.size() + rc->data.size()); + merged->data.insert(merged->data.end(), fc->data.begin(), fc->data.end()); + merged->data.insert(merged->data.end(), rc->data.begin(), rc->data.end()); + merged->row_step = (uint32_t)merged->data.size(); + + return merged; + } + + /* ============================================================ + * 栅格化(轻量,<1ms) + * ============================================================ */ + std::vector> generateOccupancyGrid(const pcl::PointCloud::Ptr& cloud) + { + std::vector> grid(grid_size_, std::vector(grid_size_, 0)); + + /* 车体标记 */ + if (filter_car_) + { + int cx0 = std::max( + 0, std::min(grid_size_ - 1, (int)((grid_range_ * 0.5f - (car_lidar_offset_x_ + car_length_ * 0.5f)) * + grid_inv_resolution_))); + int cx1 = std::max( + 0, std::min(grid_size_ - 1, (int)((grid_range_ * 0.5f - (car_lidar_offset_x_ - car_length_ * 0.5f)) * + grid_inv_resolution_))); + int cy0 = std::max( + 0, std::min(grid_size_ - 1, (int)((-car_lidar_offset_y_ - car_width_ * 0.5f + grid_range_ * 0.5f) * + grid_inv_resolution_))); + int cy1 = std::max( + 0, std::min(grid_size_ - 1, (int)((-car_lidar_offset_y_ + car_width_ * 0.5f + grid_range_ * 0.5f) * + grid_inv_resolution_))); + for (int x = cx0; x <= cx1; ++x) + for (int y = cy0; y <= cy1; ++y) grid[x][y] = 2; + } + + /* 障碍物标记 */ + for (const auto& pt : cloud->points) + { + int gx = (int)((grid_range_ * 0.5f - pt.x) * grid_inv_resolution_); + int gy = (int)((pt.y + grid_range_ * 0.5f) * grid_inv_resolution_); + if (gx >= 0 && gx < grid_size_ && gy >= 0 && gy < grid_size_) + if (grid[gx][gy] != 2) grid[gx][gy] = 1; + } + return grid; + } + + /* ============================================================ + * 栅格打印(去掉 std::system("clear"),改用 ANSI 转义) + * ============================================================ */ + void visualizeGridInTerminal(const std::vector>& grid) + { + /* ANSI:光标移到左上角(不 fork 进程,零开销) */ + std::fputs("\033[H", stdout); + + static std::string line; + line.clear(); + line.reserve(grid_size_ + 4); + + std::printf(" 障碍物矩阵 [%dx%d] merged#%d\n", grid_size_, grid_size_, merged_count_.load()); + + for (int i = 0; i < grid_size_; ++i) + { + std::printf("%2d|", i); + for (int j = 0; j < grid_size_; ++j) + { + int v = grid[i][j]; + std::putchar(v == 1 ? '@' : (v == 2 ? ' ' : '.')); + } + std::puts("|"); + } + std::fflush(stdout); + } + + /* ============================================================ + * 发布栅格 + * ============================================================ */ + void publishGrid(const std::vector>& grid) + { + std_msgs::msg::Int32MultiArray msg; + msg.layout.dim.resize(2); + msg.layout.dim[0].label = "height"; + msg.layout.dim[0].size = grid.size(); + msg.layout.dim[0].stride = grid.size() * grid[0].size(); + msg.layout.dim[1].label = "width"; + msg.layout.dim[1].size = grid[0].size(); + msg.layout.dim[1].stride = grid[0].size(); + msg.data.reserve(grid.size() * grid[0].size()); + for (const auto& row : grid) msg.data.insert(msg.data.end(), row.begin(), row.end()); + grid_pub_->publish(msg); + } + + /* ============================================================ + * 离群点去除(可配置算法,替代固定 SOR) + * + * method 0: 直接返回,不做离群点去除(最快) + * method 1: 体素邻居密度滤波 O(N·K),用哈希表查邻居,比 KDTree 快 + * method 2: RadiusOutlierRemoval(PCL内置,用 KDTree,中速) + * method 3: StatisticalOutlierRemoval(PCL内置,KDTree,最慢) + * ============================================================ */ + pcl::PointCloud::Ptr applyOutlierRemoval(const pcl::PointCloud::Ptr& cloud) + { + if (!cloud || cloud->empty()) return cloud; + + switch (sor_method_) + { + /* ── 方法 0:直接返回 ── */ + case 0: + return cloud; + + /* ── 方法 1:体素邻居密度滤波 O(N) ── + * 原理:把点云再次体素化(使用更大的搜索半径), + * 统计每个点在 radius_search_ 范围内的邻居数, + * 少于 min_neighbors_ 的点视为离群点剔除。 + * 比 KDTree 快:用 unordered_map 做 voxel hash,O(1) 查询。 + */ + case 1: + { + const float r = radius_search_; + const int min = min_neighbors_; + const float inv = 1.0f / r; + + /* 体素化哈希:key = (ix,iy,iz),value = 点数 */ + struct VKey + { + int x, y, z; + bool operator==(const VKey& o) const { return x == o.x && y == o.y && z == o.z; } + }; + struct VHash + { + size_t operator()(const VKey& k) const + { + size_t h = (size_t)k.x * 73856093u ^ (size_t)k.y * 19349663u ^ (size_t)k.z * 83492791u; + return h; + } + }; + std::unordered_map vmap; + vmap.reserve(cloud->size()); + + for (const auto& p : cloud->points) + { + VKey k{(int)std::floor(p.x * inv), (int)std::floor(p.y * inv), (int)std::floor(p.z * inv)}; + vmap[k]++; + } + + /* 检查每个点所在体素及其 26 个邻居的总邻居数 */ + auto out = std::make_shared>(); + out->reserve(cloud->size()); + for (const auto& p : cloud->points) + { + int ix = (int)std::floor(p.x * inv); + int iy = (int)std::floor(p.y * inv); + int iz = (int)std::floor(p.z * inv); + int count = 0; + for (int dx = -1; dx <= 1 && count < min; ++dx) + for (int dy = -1; dy <= 1 && count < min; ++dy) + for (int dz = -1; dz <= 1 && count < min; ++dz) + { + auto it = vmap.find({ix + dx, iy + dy, iz + dz}); + if (it != vmap.end()) count += it->second; + } + if (count >= min) out->points.push_back(p); + } + out->width = (uint32_t)out->points.size(); + out->height = 1; + out->is_dense = true; + return out; + } + + /* ── 方法 2:RadiusOutlierRemoval ── */ + case 2: + { + auto out = std::make_shared>(); + pcl::RadiusOutlierRemoval ror; + ror.setInputCloud(cloud); + ror.setRadiusSearch(radius_search_); + ror.setMinNeighborsInRadius(min_neighbors_); + ror.filter(*out); + return out; + } + + /* ── 方法 3:StatisticalOutlierRemoval(原始实现,保留兼容)── */ + case 3: + default: + { + auto out = std::make_shared>(); + pcl::StatisticalOutlierRemoval sor; + sor.setInputCloud(cloud); + sor.setMeanK(stat_mean_k_); + sor.setStddevMulThresh(stat_std_thresh_); + sor.filter(*out); + return out; + } + } + } + + /* ============================================================ + * 线程亲和性 + 实时优先级辅助函数 + * ============================================================ */ + static void setThreadAffinity(std::thread& t, std::initializer_list cores) + { +#ifdef __linux__ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + for (int c : cores) CPU_SET(c, &cpuset); + int rc = pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) fprintf(stderr, "[Affinity] pthread_setaffinity_np failed: %s\n", strerror(rc)); +#endif + } + + static void setThreadRealtime(std::thread& t, int priority /*1-99*/) + { +#ifdef __linux__ + sched_param sp{}; + sp.sched_priority = priority; + int rc = pthread_setschedparam(t.native_handle(), SCHED_FIFO, &sp); + if (rc != 0) + fprintf(stderr, "[Realtime] pthread_setschedparam failed (need root/CAP_SYS_NICE): %s\n", strerror(rc)); +#endif + } + + /* ============================================================ + * GPU 辅助 + * ============================================================ */ + void initGpuResources() + { +#ifdef USE_GPU + if (!use_gpu_) return; + int max_voxel_dim = (int)(grid_range_ / voxel_size_) + 16; + bool ok = GpuResources::instance().init(200000, max_voxel_dim); + if (!ok) + { + use_gpu_ = false; + return; + } + RCLCPP_INFO(get_logger(), "[GPU] Initialized, max_voxel_dim=%d", max_voxel_dim); +#endif + } + +#ifdef USE_GPU + GpuProcessingParams buildGpuParams() const + { + GpuProcessingParams p{}; + p.min_x = filter_min_x_; + p.max_x = filter_max_x_; + p.min_y = filter_min_y_; + p.max_y = filter_max_y_; + p.min_z = filter_min_z_; + p.max_z = filter_max_z_; + p.voxel_resolution = voxel_size_; + p.grid_range = grid_range_; + p.sor_mean_k = stat_mean_k_; + p.sor_std_mult = stat_std_thresh_; + p.filter_car = filter_car_; + if (filter_car_) + { + p.car_min_x = car_lidar_offset_x_ - car_length_ * 0.5f; + p.car_max_x = car_lidar_offset_x_ + car_length_ * 0.5f; + p.car_min_y = car_lidar_offset_y_ - car_width_ * 0.5f; + p.car_max_y = car_lidar_offset_y_ + car_width_ * 0.5f; + p.car_min_z = filter_min_z_; + p.car_max_z = filter_max_z_; + p.filter_brush = true; + p.brush_r_min_x = 0.85f; + p.brush_r_max_x = 1.25f; + p.brush_r_min_y = 0.60f; + p.brush_r_max_y = 0.96f; + p.brush_r_min_z = 1.20f; + p.brush_r_max_z = 1.65f; + p.brush_l_min_x = 0.85f; + p.brush_l_max_x = 1.25f; + p.brush_l_min_y = -0.96f; + p.brush_l_max_y = -0.60f; + p.brush_l_min_z = 1.20f; + p.brush_l_max_z = 1.65f; + } + return p; + } +#endif + + /* ============================================================ + * CropBox 初始化(仅用于需要单独调用的场景) + * ============================================================ */ + void initCropBoxes() + { + if (!filter_car_) return; + Eigen::Vector4f car_min(car_lidar_offset_x_ - car_length_ * 0.5f, car_lidar_offset_y_ - car_width_ * 0.5f, + filter_min_z_, 1.f); + Eigen::Vector4f car_max(car_lidar_offset_x_ + car_length_ * 0.5f, car_lidar_offset_y_ + car_width_ * 0.5f, + filter_max_z_, 1.f); + car_crop_.setMin(car_min); + car_crop_.setMax(car_max); + car_crop_.setNegative(true); + + right_brush_crop_.setMin({0.85f, 0.60f, 1.20f, 1.f}); + right_brush_crop_.setMax({1.25f, 0.96f, 1.65f, 1.f}); + right_brush_crop_.setNegative(true); + + left_brush_crop_.setMin({0.85f, -0.96f, 1.20f, 1.f}); + left_brush_crop_.setMax({1.25f, -0.60f, 1.65f, 1.f}); + left_brush_crop_.setNegative(true); + } + + /* ============================================================ + * 统计信息 + * ============================================================ */ + void printStats() + { + auto t = now(); + if ((t - last_stats_time_).seconds() < 5.0) return; + last_stats_time_ = t; + int f = front_rx_.load(), r = rear_rx_.load(), m = merged_count_.load(); + double merge_rate = (f > 0) ? m * 100.0 / std::min(f, r) : 0.0; + RCLCPP_INFO(get_logger(), + "Stats | rx(F=%d,R=%d) merged=%d(%.0f%%) | queue(raw_F=%zu,raw_R=%zu,proc=%zu) | sor_method=%d", f, + r, m, merge_rate, raw_queue_front_.size(), raw_queue_rear_.size(), processed_queue_.size(), + sor_method_); + } + + /* ============================================================ + * 成员变量 + * ============================================================ */ + /* -- 参数 -- */ std::string front_topic_, rear_topic_, output_topic_, target_frame_; - int queue_size_, cache_size_; - double time_tolerance_, max_wait_time_; - bool enable_debug_; - - // 点云处理过滤器(避免重复创建) - pcl::VoxelGrid vg_; // 体素网格过滤器 - pcl::StatisticalOutlierRemoval sor_; // 统计离群点去除器 - pcl::CropBox car_crop_; // 车身区域过滤器 - pcl::CropBox right_brush_crop_; // 右侧主刷过滤器 - pcl::CropBox left_brush_crop_; // 左侧主刷过滤器 - - // 点云处理参数 - float filter_min_x_, filter_max_x_; - float filter_min_y_, filter_max_y_; + int queue_size_, stat_mean_k_, grid_size_; + int sor_method_, min_neighbors_; + float radius_search_; + double time_tolerance_; + bool enable_debug_, enable_print_, use_gpu_, filter_car_; + float filter_min_x_, filter_max_x_, filter_min_y_, filter_max_y_; float filter_min_z_, filter_max_z_; - float voxel_size_; // 降采样体素大小 - int stat_mean_k_; // 统计离群点去除的k值 - float stat_std_thresh_; // 统计离群点去除的标准差阈值 - int grid_size_; // 栅格大小 - float grid_range_; // 栅格范围 - bool enable_print_; + float voxel_size_, stat_std_thresh_, grid_range_; + float car_length_, car_width_, car_lidar_offset_x_, car_lidar_offset_y_; + float grid_resolution_, grid_inv_resolution_; - // 车身过滤参数 - bool filter_car_{true}; - bool use_gpu_{false}; - float car_length_{2.3f}; - float car_width_{1.32f}; - float car_lidar_offset_x_{0.0f}; - float car_lidar_offset_y_{0.0f}; + /* -- PCL 过滤器(仅作为模板,处理线程使用线程本地副本) -- */ + pcl::VoxelGrid vg_; + pcl::StatisticalOutlierRemoval sor_; + pcl::CropBox car_crop_, right_brush_crop_, left_brush_crop_; + /* -- ROS -- */ rclcpp::Subscription::SharedPtr sub_front_, sub_rear_; rclcpp::Publisher::SharedPtr pub_; - rclcpp::Publisher::SharedPtr grid_pub_; - std::shared_ptr tf_buffer_; std::shared_ptr tf_listener_; - // 点云缓存 - std::deque> front_clouds_; - std::deque> rear_clouds_; - std::mutex cache_mutex_; + /* -- 队列 -- */ + BoundedQueue raw_queue_front_{4}; + BoundedQueue raw_queue_rear_{4}; + BoundedQueue processed_queue_{8}; + /* grid_queue_ 深度为 2:栅格化慢时最多积压 2 帧,其余丢弃,保证不拖慢发布 */ + BoundedQueue::Ptr> grid_queue_{2}; - // 内存池 - PointCloud2Pool cloud_pool_; + /* -- 线程 -- */ + std::thread process_thread_front_; + std::thread process_thread_rear_; + std::thread publish_thread_; + std::thread grid_thread_; + std::atomic shutdown_; - // 统计信息 + /* -- 内存池 -- */ + PointCloud2Pool pool_; + + /* -- 统计 -- */ rclcpp::Time last_stats_time_; - int front_count_, rear_count_, merged_count_; + std::atomic front_rx_{0}, rear_rx_{0}, merged_count_{0}; }; +/* ================================================================ + * main + * ================================================================ */ int main(int argc, char** argv) { rclcpp::init(argc, argv); - // 设置线程优先级 - auto node = std::make_shared(); + /* MultiThreadedExecutor:让订阅回调并发执行(前/后雷达回调互不阻塞) */ + rclcpp::executors::MultiThreadedExecutor executor(rclcpp::ExecutorOptions(), 2 /* 线程数=2,匹配两个订阅 */); + + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); - rclcpp::spin(node); rclcpp::shutdown(); return 0; } \ No newline at end of file