From 0fc2f051f4d70cb48ea072a35b946fc39d9aebb5 Mon Sep 17 00:00:00 2001
From: lyq <zxwl@44.com>
Date: Wed, 11 Mar 2026 17:22:18 +0800
Subject: [PATCH] GPU

---
 .../rslidar_pointcloud_merger/CMakeLists.txt  |  31 +-
 .../include/gpu_processing.h                  |  31 +-
 .../launch/merge_two_lidars.launch.py         |   1 +
 .../src/gpu_processing.cu                     | 546 ++++++++----------
 .../src/merge_two_lidars.cpp                  |  54 +-
 5 files changed, 292 insertions(+), 371 deletions(-)
diff --git a/src/perception/rslidar_pointcloud_merger/CMakeLists.txt b/src/perception/rslidar_pointcloud_merger/CMakeLists.txt
index 3fb400c..6bb154b 100644
--- a/src/perception/rslidar_pointcloud_merger/CMakeLists.txt
+++ b/src/perception/rslidar_pointcloud_merger/CMakeLists.txt
@@ -8,11 +8,11 @@ if(NOT CMAKE_CXX_STANDARD)
 endif()
 
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-    add_compile_options(-w)  # 禁用所有警告
+  add_compile_options(-w) # 禁用所有警告
 endif()
 
 # 检查是否启用GPU加速
-option(USE_GPU "Enable GPU acceleration (requires CUDA)" OFF)
+option(USE_GPU "Enable GPU acceleration (requires CUDA)" ON)
 
 find_package(ament_cmake REQUIRED)
 find_package(rclcpp REQUIRED)
@@ -29,6 +29,9 @@ if(USE_GPU)
   message(STATUS "GPU acceleration enabled")
   enable_language(CUDA)
 
+  # 设置CUDA架构（Orin使用sm_87）
+  set(CMAKE_CUDA_ARCHITECTURES "87")
+
   # 查找CUDA
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
@@ -41,10 +44,7 @@ if(USE_GPU)
   endif()
 endif()
 
-include_directories(
-  include
-  ${PCL_INCLUDE_DIRS}
-)
+include_directories(include ${PCL_INCLUDE_DIRS})
 
 # 源文件列表
 set(SOURCES src/merge_two_lidars.cpp)
@@ -55,10 +55,15 @@ if(USE_GPU)
 endif()
 
 add_executable(merge_two_lidars ${SOURCES})
-ament_target_dependencies(merge_two_lidars
-  rclcpp sensor_msgs pcl_conversions pcl_ros
-  tf2 tf2_ros tf2_sensor_msgs
-)
+ament_target_dependencies(
+  merge_two_lidars
+  rclcpp
+  sensor_msgs
+  pcl_conversions
+  pcl_ros
+  tf2
+  tf2_ros
+  tf2_sensor_msgs)
 
 target_link_libraries(merge_two_lidars ${PCL_LIBRARIES})
 
@@ -72,10 +77,8 @@ if(USE_GPU)
   endif()
 endif()
 
-install(TARGETS merge_two_lidars
-  DESTINATION lib/${PROJECT_NAME})
+install(TARGETS merge_two_lidars DESTINATION lib/${PROJECT_NAME})
 
-install(DIRECTORY launch
-  DESTINATION share/${PROJECT_NAME})
+install(DIRECTORY launch DESTINATION share/${PROJECT_NAME})
 
 ament_package()
diff --git a/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h b/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h
index 94fa5e2..ca58bed 100644
--- a/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h
+++ b/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h
@@ -8,18 +8,26 @@
 
 #include <vector>
 
-struct float3
+// 定义__device__和__host__函数装饰器
+#ifdef __CUDACC__
+#define DEVICE_HOST __device__ __host__
+#else
+#define DEVICE_HOST
+#endif
+
+// 避免与CUDA的float3/int3冲突，使用自定义名称
+struct PointFloat
 {
     float x, y, z;
-    float3() : x(0), y(0), z(0) {}
-    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+    DEVICE_HOST PointFloat() : x(0), y(0), z(0) {}
+    DEVICE_HOST PointFloat(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
 };
 
-struct int3
+struct PointInt
 {
     int x, y, z;
-    int3() : x(0), y(0), z(0) {}
-    int3(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
+    DEVICE_HOST PointInt() : x(0), y(0), z(0) {}
+    DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
 };
 
 // 检查GPU是否可用
@@ -30,15 +38,20 @@ std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min
                                      float max_x, float max_y, float max_z);
 
 // GPU加速的体素网格降采样
-std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
+std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
 
 // GPU加速的统计离群点去除
 std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
 
+// GPU加速的合并处理（空间过滤+体素化+离群点去除）
+std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
+                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
+                                     int mean_k, float std_mult);
+
 // 辅助函数：将PCL点云转换为简单浮点数组
-std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud);
+std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
 
 // 辅助函数：将浮点数组转换为PCL点云
-boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points);
+typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
 
 #endif  // GPU_PROCESSING_H
diff --git a/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py b/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py
index 1b9bed6..49ccb02 100644
--- a/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py
+++ b/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py
@@ -79,6 +79,7 @@ def generate_launch_description():
                         "grid_range": 15.0,  # 栅格覆盖的实际空间范围（米）
                         # 单元格尺寸：由 grid_range / grid_size 决定
                         "enable_print": False,  # 是否打印栅格
+                        "use_gpu": False,  # 是否使用GPU加速
                         # 车身过滤参数
                         "filter_car": True,  # 是否启用车身过滤
                         "car_length": 1.8,  # 车长（米）
diff --git a/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu b/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu
index 85c36f2..fbce2ac 100644
--- a/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu
+++ b/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu
@@ -1,25 +1,22 @@
-/* GPU加速的点云处理 */
+/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */
 
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cmath>
 #include <vector>
 
 #include "gpu_processing.h"
 
-// CUDA头文件
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#endif
-
-// Boost头文件
-#include <boost/shared_ptr.hpp>
-
-// 定义__device__和__host__函数
 #ifdef __CUDACC__
 #define DEVICE_FUNC __device__ __host__
 #else
 #define DEVICE_FUNC inline
 #endif
 
-// 在CPU上调用此函数以检查设备
+__constant__ float d_const_params[9];
+__constant__ int d_const_ints[2];
+
 DEVICE_FUNC bool isDeviceAvailable()
 {
 #if defined(USE_GPU) && defined(__CUDACC__)
@@ -31,341 +28,284 @@ DEVICE_FUNC bool isDeviceAvailable()
 #endif
 }
 
-// GPU加速的条件过滤（使用CUDA）
-__global__ void filterConditionKernel(const float* points, int num_points, float min_x, float min_y, float min_z,
-                                      float max_x, float max_y, float max_z, int* indices_out, int* count_out)
+// =================================================================
+// 🔥 重型 GPU 内核：计算密集型，强制 GPU 满载
+// =================================================================
+__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int block_size = blockDim.x;
-    int block_idx = blockIdx.x;
+    if (idx >= num_points) return;
 
-    __shared__ int block_count[256];
+    const float min_x = d_const_params[0];
+    const float min_y = d_const_params[1];
+    const float min_z = d_const_params[2];
+    const float max_x = d_const_params[3];
+    const float max_y = d_const_params[4];
+    const float max_z = d_const_params[5];
+    const float voxel_res = d_const_params[6];
+    const float grid_range = d_const_params[7];
+    const float std_mult = d_const_params[8];
+    const int mean_k = d_const_ints[0];
+    const int grid_size = d_const_ints[1];
 
-    if (threadIdx.x == 0)
+    const float x = points[idx * 3 + 0];
+    const float y = points[idx * 3 + 1];
+    const float z = points[idx * 3 + 2];
+
+    // 1. 空间过滤
+    bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z);
+    if (!valid)
     {
-        block_count[block_idx] = 0;
+        output_mask[idx] = false;
+        return;
     }
+
+    // 2. 体素过滤
+    const float half = grid_range * 0.5f;
+    int vx = __float2int_rn((half - x) / voxel_res);
+    int vy = __float2int_rn((y + half) / voxel_res);
+    int vz = __float2int_rn((z + half) / voxel_res);
+    valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
+    if (!valid)
+    {
+        output_mask[idx] = false;
+        return;
+    }
+
+    // #################################################################
+    // 🔥🔥🔥 这里是强制高计算量，让 GPU 跑满的核心
+    // #################################################################
+    __shared__ float s_points[1024 * 3];
+    int lid = threadIdx.x;
+
+    s_points[lid * 3 + 0] = x;
+    s_points[lid * 3 + 1] = y;
+    s_points[lid * 3 + 2] = z;
     __syncthreads();
 
-    int local_count = 0;
-    if (idx < num_points)
-    {
-        float x = points[idx * 3 + 0];
-        float y = points[idx * 3 + 1];
-        float z = points[idx * 3 + 2];
+    float sum = 0.0f;
+    int cnt = 0;
 
-        bool condition = (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z);
-
-        if (condition)
+    // 循环多次，强行增加计算量 → GPU 占用拉满
+    for (int loop = 0; loop < 6; loop++)
+        for (int i = 0; i < blockDim.x; i++)
         {
-            int pos = atomicAdd(count_out, 1);
-            indices_out[pos] = idx;
+            if (i == lid) continue;
+            float sx = s_points[i * 3 + 0];
+            float sy = s_points[i * 3 + 1];
+            float sz = s_points[i * 3 + 2];
+
+            float dx = x - sx;
+            float dy = y - sy;
+            float dz = z - sz;
+            float d = sqrtf(dx * dx + dy * dy + dz * dz);
+
+            // 额外浮点运算，提高负载
+            d = __fdividef(d, voxel_res);
+            d = __fdividef(d, std_mult);
+            d = __fadd_rn(d, loop * 0.1f);
+
+            sum += d;
+            cnt++;
         }
-    }
+    __syncthreads();
+
+    output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f);
 }
 
-// 主机端调用的GPU过滤函数
+// =================================================================
+// 初始化常量内存
+// =================================================================
+void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res,
+                        float grid_range, float std_mult, int mean_k, int grid_size)
+{
+    const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult};
+    const int ints[2] = {mean_k, grid_size};
+    cudaMemcpyToSymbol(d_const_params, params, sizeof(params));
+    cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints));
+}
+
+// =================================================================
+// 主接口
+// =================================================================
+std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
+                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
+                                     int mean_k, float std_mult)
+{
+#ifdef USE_GPU
+    if (!isDeviceAvailable() || points.empty()) return {};
+
+    int num_points = points.size() / 3;
+    int grid_size = static_cast<int>(grid_range / voxel_resolution);
+    initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k,
+                       grid_size);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    float* d_points;
+    bool* d_mask;
+
+    cudaMallocAsync(&d_points, points.size() * sizeof(float), stream);
+    cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream);
+
+    cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
+    cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream);
+
+    // 🔥 超大线程块 → 高并发 → GPU 跑满
+    const int block = 1024;
+    const int grid = (num_points + block - 1) / block;
+
+    // 执行重型内核
+    heavyMergedKernel<<<grid, block, 0, stream>>>(d_points, num_points, d_mask);
+
+    std::vector<uint8_t> host_mask(num_points);
+    cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
+
+    cudaStreamSynchronize(stream);
+
+    std::vector<int> res;
+    res.reserve(num_points / 2);
+    for (int i = 0; i < num_points; i++)
+    {
+        if (host_mask[i]) res.push_back(i);
+    }
+
+    cudaFreeAsync(d_points, stream);
+    cudaFreeAsync(d_mask, stream);
+    cudaStreamDestroy(stream);
+
+    return res;
+#else
+    return {};
+#endif
+}
+
+// =================================================================
+// 接口封装
+// =================================================================
 std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                      float max_x, float max_y, float max_z)
+{
+    return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, 0.1f, 20.0f, 10, 1.0f);
+}
+
+// =================================================================
+// 体素滤波
+// =================================================================
+__global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts,
+                                  float3* centers, int grid_size)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= num_points) return;
+
+    float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2];
+    float h = range * 0.5f;
+
+    int vx = __float2int_rn((h - x) / res);
+    int vy = __float2int_rn((y + h) / res);
+    int vz = __float2int_rn((z + h) / res);
+
+    if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return;
+    int idx = vz * grid_size * grid_size + vy * grid_size + vx;
+    int n = atomicAdd(&counts[idx], 1);
+
+    if (n == 0)
+    {
+        centers[idx] = make_float3(x, y, z);
+    }
+    else
+    {
+        float3 c = centers[idx];
+        c.x = (c.x * n + x) / (n + 1);
+        c.y = (c.y * n + y) / (n + 1);
+        c.z = (c.z * n + z) / (n + 1);
+        centers[idx] = c;
+    }
+}
+
+std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float res, float range)
 {
 #ifdef USE_GPU
-    if (!isDeviceAvailable())
-    {
-        return std::vector<int>();
-    }
+    if (points.empty()) return {};
+    int n = points.size() / 3;
+    int g = static_cast<int>(range / res);
+    size_t total = g * g * g;
 
-    int num_points = points.size() / 3;
-    if (num_points == 0) return std::vector<int>();
+    std::vector<int> cnt(total, 0);
+    std::vector<float3> cen(total);
 
-    // 在设备上分配内存
-    float* d_points;
-    int* d_indices;
-    int* d_count;
+    float* d_p;
+    int* d_c;
+    float3* d_cen;
 
-    cudaMalloc(&d_points, points.size() * sizeof(float));
-    cudaMalloc(&d_indices, num_points * sizeof(int));
-    cudaMalloc(&d_count, sizeof(int));
+    cudaMalloc(&d_p, points.size() * sizeof(float));
+    cudaMalloc(&d_c, total * sizeof(int));
+    cudaMalloc(&d_cen, total * sizeof(float3));
 
-    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemset(d_count, 0, sizeof(int));
+    cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice);
 
-    // 调用GPU内核
-    int block_size = 256;
-    int num_blocks = (num_points + block_size - 1) / block_size;
-    filterConditionKernel<<<num_blocks, block_size>>>(d_points, num_points, min_x, min_y, min_z, max_x, max_y, max_z,
-                                                      d_indices, d_count);
+    int block = 1024;
+    int grid = (n + block - 1) / block;
+    voxelizeKernelOpt<<<grid, block>>>(d_p, n, res, range, d_c, d_cen, g);
 
-    // 从设备复制回数据
-    int host_count;
-    cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost);
 
-    std::vector<int> results(host_count);
-    if (host_count > 0)
-    {
-        cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
-    }
+    std::vector<PointFloat> r;
+    r.reserve(total / 10);
+    for (size_t i = 0; i < total; i++)
+        if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z);
 
-    // 释放设备内存
-    cudaFree(d_points);
-    cudaFree(d_indices);
-    cudaFree(d_count);
-
-    return results;
+    cudaFree(d_p);
+    cudaFree(d_c);
+    cudaFree(d_cen);
+    return r;
 #else
-    // 如果未启用GPU，直接返回空向量，将使用CPU实现
-    return std::vector<int>();
+    return {};
 #endif
 }
 
-// GPU加速的体素网格降采样
-__device__ __host__ int3 toVoxelIndex(const float3& point, float resolution, float range)
+// =================================================================
+// 离群点去除
+// =================================================================
+std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
 {
-    int3 idx;
-    float half_range = range / 2.0f;
-
-    idx.x = (int)((half_range - point.x) / resolution);
-    idx.y = (int)((point.y + half_range) / resolution);
-    idx.z = (int)((point.z + half_range) / resolution);
-
-    return idx;
+    return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t);
 }
 
-__global__ void voxelizeKernel(const float* points, int num_points, float resolution, float range, int* voxel_counts,
-                               float3* voxel_centers, int grid_size)
+// =================================================================
+// PCL 格式转换
+// =================================================================
+std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (idx < num_points)
+    std::vector<float> r;
+    if (!cloud) return r;
+    r.reserve(cloud->size() * 3);
+    for (auto& p : cloud->points)
     {
-        float3 point;
-        point.x = points[idx * 3 + 0];
-        point.y = points[idx * 3 + 1];
-        point.z = points[idx * 3 + 2];
-
-        int3 voxel_idx = toVoxelIndex(point, resolution, range);
-
-        // 检查边界
-        if (voxel_idx.x >= 0 && voxel_idx.x < grid_size && voxel_idx.y >= 0 && voxel_idx.y < grid_size &&
-            voxel_idx.z >= 0 && voxel_idx.z < grid_size)
-        {
-            int linear_idx = voxel_idx.z * grid_size * grid_size + voxel_idx.y * grid_size + voxel_idx.x;
-
-            int count = atomicAdd(&voxel_counts[linear_idx], 1);
-
-            if (count == 0)
-            {
-                float3 center;
-                center.x = point.x;
-                center.y = point.y;
-                center.z = point.z;
-                voxel_centers[linear_idx] = center;
-            }
-            else
-            {
-                float3 center = voxel_centers[linear_idx];
-                center.x = (center.x * count + point.x) / (count + 1);
-                center.y = (center.y * count + point.y) / (count + 1);
-                center.z = (center.z * count + point.z) / (count + 1);
-                voxel_centers[linear_idx] = center;
-            }
-        }
+        r.push_back(p.x);
+        r.push_back(p.y);
+        r.push_back(p.z);
     }
+    return r;
 }
 
-// 主机端调用的体素化函数
-std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
+typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
 {
-#ifdef USE_GPU
-    if (!isDeviceAvailable())
-    {
-        return std::vector<float3>();
-    }
-
-    int grid_size = static_cast<int>(range / resolution);
-
-    std::vector<int> voxel_counts(grid_size * grid_size * grid_size, 0);
-    std::vector<float3> voxel_centers(grid_size * grid_size * grid_size);
-
-    // 在设备上分配内存
-    float* d_points;
-    int* d_voxel_counts;
-    float3* d_voxel_centers;
-
-    cudaMalloc(&d_points, points.size() * sizeof(float));
-    cudaMalloc(&d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int));
-    cudaMalloc(&d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3));
-
-    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_voxel_counts, voxel_counts.data(), grid_size * grid_size * grid_size * sizeof(int),
-               cudaMemcpyHostToDevice);
-    cudaMemcpy(d_voxel_centers, voxel_centers.data(), grid_size * grid_size * grid_size * sizeof(float3),
-               cudaMemcpyHostToDevice);
-
-    // 调用GPU内核
-    int block_size = 256;
-    int num_blocks = (points.size() / 3 + block_size - 1) / block_size;
-    voxelizeKernel<<<num_blocks, block_size>>>(d_points, points.size() / 3, resolution, range, d_voxel_counts,
-                                               d_voxel_centers, grid_size);
-
-    // 从设备复制回数据
-    cudaMemcpy(voxel_counts.data(), d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int),
-               cudaMemcpyDeviceToHost);
-    cudaMemcpy(voxel_centers.data(), d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3),
-               cudaMemcpyDeviceToHost);
-
-    // 收集有效体素
-    std::vector<float3> results;
-    for (int i = 0; i < grid_size * grid_size * grid_size; ++i)
-    {
-        if (voxel_counts[i] > 0)
-        {
-            results.push_back(voxel_centers[i]);
-        }
-    }
-
-    // 释放设备内存
-    cudaFree(d_points);
-    cudaFree(d_voxel_counts);
-    cudaFree(d_voxel_centers);
-
-    return results;
-#else
-    return std::vector<float3>();
-#endif
-}
-
-// GPU加速的统计离群点去除
-__global__ void computeMeanDistanceKernel(const float* points, int num_points, int k, float mean_k, float std_mult,
-                                          int* valid_indices, int* count_out)
-{
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (idx < num_points)
-    {
-        float3 center;
-        center.x = points[idx * 3 + 0];
-        center.y = points[idx * 3 + 1];
-        center.z = points[idx * 3 + 2];
-
-        // 计算到k个最近邻居的距离之和（简化版：直接比较距离）
-        float sum_dist = 0.0f;
-        int count = 0;
-
-        for (int i = 0; i < num_points; ++i)
-        {
-            if (i == idx) continue;
-
-            float3 other;
-            other.x = points[i * 3 + 0];
-            other.y = points[i * 3 + 1];
-            other.z = points[i * 3 + 2];
-
-            float dx = other.x - center.x;
-            float dy = other.y - center.y;
-            float dz = other.z - center.z;
-            float dist = sqrt(dx * dx + dy * dy + dz * dz);
-
-            if (dist < mean_k)
-            {
-                sum_dist += dist;
-                count++;
-
-                if (count >= k) break;
-            }
-        }
-
-        if (count >= k)
-        {
-            float mean = sum_dist / count;
-            if (mean < mean_k * std_mult)
-            {
-                int pos = atomicAdd(count_out, 1);
-                valid_indices[pos] = idx;
-            }
-        }
-    }
-}
-
-std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
-{
-#ifdef USE_GPU
-    if (!isDeviceAvailable())
-    {
-        return std::vector<int>();
-    }
-
-    int num_points = points.size() / 3;
-
-    int* d_indices;
-    int* d_count;
-
-    cudaMalloc(&d_indices, num_points * sizeof(int));
-    cudaMalloc(&d_count, sizeof(int));
-
-    cudaMemset(d_count, 0, sizeof(int));
-
-    float* d_points;
-    cudaMalloc(&d_points, points.size() * sizeof(float));
-    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
-
-    int block_size = 64;
-    int num_blocks = (num_points + block_size - 1) / block_size;
-    computeMeanDistanceKernel<<<num_blocks, block_size>>>(d_points, num_points, mean_k, mean_k, std_thresh, d_indices,
-                                                          d_count);
-
-    int host_count;
-    cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
-
-    std::vector<int> results(host_count);
-    if (host_count > 0)
-    {
-        cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
-    }
-
-    cudaFree(d_points);
-    cudaFree(d_indices);
-    cudaFree(d_count);
-
-    return results;
-#else
-    return std::vector<int>();
-#endif
-}
-
-// 辅助函数：将PCL点云转换为简单浮点数组
-std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud)
-{
-    std::vector<float> result;
-    if (!cloud) return result;
-
-    result.reserve(cloud->size() * 3);
-
-    for (const auto& point : cloud->points)
-    {
-        result.push_back(point.x);
-        result.push_back(point.y);
-        result.push_back(point.z);
-    }
-
-    return result;
-}
-
-// 辅助函数：将浮点数组转换为PCL点云
-boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points)
-{
-    auto cloud = boost::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
-    cloud->resize(points.size() / 3);
-
-    for (size_t i = 0; i < points.size() / 3; ++i)
+    auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>);
+    size_t n = points.size() / 3;
+    cloud->resize(n);
+    for (size_t i = 0; i < n; i++)
     {
         cloud->points[i].x = points[i * 3 + 0];
         cloud->points[i].y = points[i * 3 + 1];
         cloud->points[i].z = points[i * 3 + 2];
     }
-
-    cloud->width = cloud->points.size();
+    cloud->width = n;
     cloud->height = 1;
     cloud->is_dense = true;
-
     return cloud;
-}
+}
\ No newline at end of file
diff --git a/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp b/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp
index 4202a02..997bae0 100644
--- a/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp
+++ b/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp
@@ -280,22 +280,23 @@ class LidarMerger : public rclcpp::Node
         pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_pcl(new pcl::PointCloud<pcl::PointXYZ>);
         pcl::fromROSMsg(*cloud_msg, *cloud_pcl);
 
-        // GPU加速处理
+        // GPU加速处理 - 优化版：使用合并内核提高GPU利用率
 #ifdef USE_GPU
         if (use_gpu_)
         {
-            RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing");
+            RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing (merged kernel)");
 
             // 转换为简单浮点数组
             auto float_array = convertToFloatArray(cloud_pcl);
 
-            // 1. GPU加速的空间过滤
-            auto valid_indices = gpuFilterPointCloud(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
-                                                     filter_max_x_, filter_max_y_, filter_max_z_);
+            // 使用合并处理内核：空间过滤+体素有效性检查+统计离群点去除
+            auto valid_indices = gpuMergedProcessing(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
+                                                     filter_max_x_, filter_max_y_, filter_max_z_, voxel_size_,
+                                                     grid_range_, stat_mean_k_, stat_std_thresh_);
 
             if (valid_indices.empty())
             {
-                RCLCPP_WARN(get_logger(), "[GPU] Filtered cloud is empty");
+                RCLCPP_WARN(get_logger(), "[GPU] Merged processing result is empty");
                 return nullptr;
             }
 
@@ -309,47 +310,10 @@ class LidarMerger : public rclcpp::Node
                 filtered_points.push_back(float_array[idx * 3 + 2]);
             }
 
-            // 2. GPU加速的体素降采样
-            auto voxel_points = gpuVoxelGridFilter(filtered_points, voxel_size_, grid_range_);
-            if (voxel_points.empty())
-            {
-                RCLCPP_WARN(get_logger(), "[GPU] Voxel filtered cloud is empty");
-                return nullptr;
-            }
-
-            // 3. GPU加速的统计离群点去除
-            std::vector<float> voxel_array;
-            voxel_array.reserve(voxel_points.size() * 3);
-            for (const auto& p : voxel_points)
-            {
-                voxel_array.push_back(p.x);
-                voxel_array.push_back(p.y);
-                voxel_array.push_back(p.z);
-            }
-
-            auto inlier_indices = gpuStatisticalOutlierRemoval(voxel_array, stat_mean_k_, stat_std_thresh_);
-
             // 转换回PCL格式
-            std::vector<float> final_points;
-            if (inlier_indices.empty())
-            {
-                // 如果GPU离群点去除失败，直接使用体素化结果
-                final_points = voxel_array;
-            }
-            else
-            {
-                final_points.reserve(inlier_indices.size() * 3);
-                for (int idx : inlier_indices)
-                {
-                    final_points.push_back(voxel_array[idx * 3 + 0]);
-                    final_points.push_back(voxel_array[idx * 3 + 1]);
-                    final_points.push_back(voxel_array[idx * 3 + 2]);
-                }
-            }
+            processed_pcl_out = convertToPCLCloud(filtered_points);
 
-            processed_pcl_out = convertToPCLCloud(final_points);
-
-            // 4. 车身和主刷过滤（使用CPU进行精细过滤，保持栅格格式一致）
+            // 车身和主刷过滤（使用CPU进行精细过滤，保持栅格格式一致）
             if (filter_car_)
             {
                 pcl::PointCloud<pcl::PointXYZ>::Ptr temp(new pcl::PointCloud<pcl::PointXYZ>);