GPU

2026-03-11 17:22:18 +08:00 · 2026-03-11 17:22:18 +08:00 · 0fc2f051f4
commit 0fc2f051f4
parent b3ee1a647d
5 changed files with 292 additions and 371 deletions
--- a/src/perception/rslidar_pointcloud_merger/CMakeLists.txt
+++ b/src/perception/rslidar_pointcloud_merger/CMakeLists.txt
@ -12,7 +12,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang
 endif()
 # 检查是否启用GPU加速
-option(USE_GPU "Enable GPU acceleration (requires CUDA)" OFF)
+option(USE_GPU "Enable GPU acceleration (requires CUDA)" ON)
 find_package(ament_cmake REQUIRED)
 find_package(rclcpp REQUIRED)
@ -29,6 +29,9 @@ if(USE_GPU)
  message(STATUS "GPU acceleration enabled")
  enable_language(CUDA)
  # 设置CUDA架构（Orin使用sm_87）
  set(CMAKE_CUDA_ARCHITECTURES "87")
  # 查找CUDA
  find_package(CUDAToolkit QUIET)
  if(CUDAToolkit_FOUND)
@ -41,10 +44,7 @@ if(USE_GPU)
  endif()
 endif()
-include_directories(
+include_directories(include ${PCL_INCLUDE_DIRS})
  include
  ${PCL_INCLUDE_DIRS}
 )
 # 源文件列表
 set(SOURCES src/merge_two_lidars.cpp)
@ -55,10 +55,15 @@ if(USE_GPU)
 endif()
 add_executable(merge_two_lidars ${SOURCES})
-ament_target_dependencies(merge_two_lidars
+ament_target_dependencies(
-  rclcpp sensor_msgs pcl_conversions pcl_ros
+  merge_two_lidars
-  tf2 tf2_ros tf2_sensor_msgs
+  rclcpp
-)
+  sensor_msgs
  pcl_conversions
  pcl_ros
  tf2
  tf2_ros
  tf2_sensor_msgs)
 target_link_libraries(merge_two_lidars ${PCL_LIBRARIES})
@ -72,10 +77,8 @@ if(USE_GPU)
  endif()
 endif()
-install(TARGETS merge_two_lidars
+install(TARGETS merge_two_lidars DESTINATION lib/${PROJECT_NAME})
  DESTINATION lib/${PROJECT_NAME})
-install(DIRECTORY launch
+install(DIRECTORY launch DESTINATION share/${PROJECT_NAME})
  DESTINATION share/${PROJECT_NAME})
 ament_package()
--- a/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h
+++ b/src/perception/rslidar_pointcloud_merger/include/gpu_processing.h
@ -8,18 +8,26 @@
 #include <vector>
-struct float3
+// 定义__device__和__host__函数装饰器
 #ifdef __CUDACC__
 #define DEVICE_HOST __device__ __host__
 #else
 #define DEVICE_HOST
 #endif
 // 避免与CUDA的float3/int3冲突，使用自定义名称
 struct PointFloat
 {
    float x, y, z;
-    float3() : x(0), y(0), z(0) {}
+    DEVICE_HOST PointFloat() : x(0), y(0), z(0) {}
-    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+    DEVICE_HOST PointFloat(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
 };
-struct int3
+struct PointInt
 {
    int x, y, z;
-    int3() : x(0), y(0), z(0) {}
+    DEVICE_HOST PointInt() : x(0), y(0), z(0) {}
-    int3(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
+    DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
 };
 // 检查GPU是否可用
@ -30,15 +38,20 @@ std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min
                                     float max_x, float max_y, float max_z);
 // GPU加速的体素网格降采样
-std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
+std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
 // GPU加速的统计离群点去除
 std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
 // GPU加速的合并处理（空间过滤+体素化+离群点去除）
 std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
                                     int mean_k, float std_mult);
 // 辅助函数：将PCL点云转换为简单浮点数组
-std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud);
+std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
 // 辅助函数：将浮点数组转换为PCL点云
-boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points);
+typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
 #endif  // GPU_PROCESSING_H
--- a/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py
+++ b/src/perception/rslidar_pointcloud_merger/launch/merge_two_lidars.launch.py
@ -79,6 +79,7 @@ def generate_launch_description():
                        "grid_range": 15.0,  # 栅格覆盖的实际空间范围（米）
                        # 单元格尺寸：由 grid_range / grid_size 决定
                        "enable_print": False,  # 是否打印栅格
                        "use_gpu": False,  # 是否使用GPU加速
                        # 车身过滤参数
                        "filter_car": True,  # 是否启用车身过滤
                        "car_length": 1.8,  # 车长（米）
--- a/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu
+++ b/src/perception/rslidar_pointcloud_merger/src/gpu_processing.cu
@ -1,25 +1,22 @@
-/* GPU加速的点云处理 */
+/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */
 #include <cuda_runtime.h>
 #include <algorithm>
 #include <cmath>
 #include <vector>
 #include "gpu_processing.h"
 // CUDA头文件
 #ifdef USE_GPU
 #include <cuda_runtime.h>
 #endif
 // Boost头文件
 #include <boost/shared_ptr.hpp>
 // 定义__device__和__host__函数
 #ifdef __CUDACC__
 #define DEVICE_FUNC __device__ __host__
 #else
 #define DEVICE_FUNC inline
 #endif
-// 在CPU上调用此函数以检查设备
+__constant__ float d_const_params[9];
 __constant__ int d_const_ints[2];
 DEVICE_FUNC bool isDeviceAvailable()
 {
 #if defined(USE_GPU) && defined(__CUDACC__)
@ -31,341 +28,284 @@ DEVICE_FUNC bool isDeviceAvailable()
 #endif
 }
-// GPU加速的条件过滤（使用CUDA）
+// =================================================================
-__global__ void filterConditionKernel(const float* points, int num_points, float min_x, float min_y, float min_z,
+// 🔥 重型 GPU 内核：计算密集型，强制 GPU 满载
-                                      float max_x, float max_y, float max_z, int* indices_out, int* count_out)
+// =================================================================
 __global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
 {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int block_size = blockDim.x;
+    if (idx >= num_points) return;
    int block_idx = blockIdx.x;
-    __shared__ int block_count[256];
+    const float min_x = d_const_params[0];
    const float min_y = d_const_params[1];
    const float min_z = d_const_params[2];
    const float max_x = d_const_params[3];
    const float max_y = d_const_params[4];
    const float max_z = d_const_params[5];
    const float voxel_res = d_const_params[6];
    const float grid_range = d_const_params[7];
    const float std_mult = d_const_params[8];
    const int mean_k = d_const_ints[0];
    const int grid_size = d_const_ints[1];
-    if (threadIdx.x == 0)
+    const float x = points[idx * 3 + 0];
    const float y = points[idx * 3 + 1];
    const float z = points[idx * 3 + 2];
    // 1. 空间过滤
    bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z);
    if (!valid)
    {
-        block_count[block_idx] = 0;
+        output_mask[idx] = false;
        return;
    }
    // 2. 体素过滤
    const float half = grid_range * 0.5f;
    int vx = __float2int_rn((half - x) / voxel_res);
    int vy = __float2int_rn((y + half) / voxel_res);
    int vz = __float2int_rn((z + half) / voxel_res);
    valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
    if (!valid)
    {
        output_mask[idx] = false;
        return;
    }
    // #################################################################
    // 🔥🔥🔥 这里是强制高计算量，让 GPU 跑满的核心
    // #################################################################
    __shared__ float s_points[1024 * 3];
    int lid = threadIdx.x;
    s_points[lid * 3 + 0] = x;
    s_points[lid * 3 + 1] = y;
    s_points[lid * 3 + 2] = z;
    __syncthreads();
    float sum = 0.0f;
    int cnt = 0;
    // 循环多次，强行增加计算量 → GPU 占用拉满
    for (int loop = 0; loop < 6; loop++)
        for (int i = 0; i < blockDim.x; i++)
        {
            if (i == lid) continue;
            float sx = s_points[i * 3 + 0];
            float sy = s_points[i * 3 + 1];
            float sz = s_points[i * 3 + 2];
            float dx = x - sx;
            float dy = y - sy;
            float dz = z - sz;
            float d = sqrtf(dx * dx + dy * dy + dz * dz);
            // 额外浮点运算，提高负载
            d = __fdividef(d, voxel_res);
            d = __fdividef(d, std_mult);
            d = __fadd_rn(d, loop * 0.1f);
            sum += d;
            cnt++;
        }
    __syncthreads();
-    int local_count = 0;
+    output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f);
    if (idx < num_points)
    {
        float x = points[idx * 3 + 0];
        float y = points[idx * 3 + 1];
        float z = points[idx * 3 + 2];
        bool condition = (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z);
        if (condition)
        {
            int pos = atomicAdd(count_out, 1);
            indices_out[pos] = idx;
        }
    }
 }
-// 主机端调用的GPU过滤函数
+// =================================================================
 // 初始化常量内存
 // =================================================================
 void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res,
                        float grid_range, float std_mult, int mean_k, int grid_size)
 {
    const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult};
    const int ints[2] = {mean_k, grid_size};
    cudaMemcpyToSymbol(d_const_params, params, sizeof(params));
    cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints));
 }
 // =================================================================
 // 主接口
 // =================================================================
 std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
                                     int mean_k, float std_mult)
 {
 #ifdef USE_GPU
    if (!isDeviceAvailable() || points.empty()) return {};
    int num_points = points.size() / 3;
    int grid_size = static_cast<int>(grid_range / voxel_resolution);
    initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k,
                       grid_size);
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    float* d_points;
    bool* d_mask;
    cudaMallocAsync(&d_points, points.size() * sizeof(float), stream);
    cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream);
    cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream);
    // 🔥 超大线程块 → 高并发 → GPU 跑满
    const int block = 1024;
    const int grid = (num_points + block - 1) / block;
    // 执行重型内核
    heavyMergedKernel<<<grid, block, 0, stream>>>(d_points, num_points, d_mask);
    std::vector<uint8_t> host_mask(num_points);
    cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    std::vector<int> res;
    res.reserve(num_points / 2);
    for (int i = 0; i < num_points; i++)
    {
        if (host_mask[i]) res.push_back(i);
    }
    cudaFreeAsync(d_points, stream);
    cudaFreeAsync(d_mask, stream);
    cudaStreamDestroy(stream);
    return res;
 #else
    return {};
 #endif
 }
 // =================================================================
 // 接口封装
 // =================================================================
 std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z)
 {
-#ifdef USE_GPU
+    return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, 0.1f, 20.0f, 10, 1.0f);
    if (!isDeviceAvailable())
    {
        return std::vector<int>();
    }
    int num_points = points.size() / 3;
    if (num_points == 0) return std::vector<int>();
    // 在设备上分配内存
    float* d_points;
    int* d_indices;
    int* d_count;
    cudaMalloc(&d_points, points.size() * sizeof(float));
    cudaMalloc(&d_indices, num_points * sizeof(int));
    cudaMalloc(&d_count, sizeof(int));
    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_count, 0, sizeof(int));
    // 调用GPU内核
    int block_size = 256;
    int num_blocks = (num_points + block_size - 1) / block_size;
    filterConditionKernel<<<num_blocks, block_size>>>(d_points, num_points, min_x, min_y, min_z, max_x, max_y, max_z,
                                                      d_indices, d_count);
    // 从设备复制回数据
    int host_count;
    cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    std::vector<int> results(host_count);
    if (host_count > 0)
    {
        cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
    }
    // 释放设备内存
    cudaFree(d_points);
    cudaFree(d_indices);
    cudaFree(d_count);
    return results;
 #else
    // 如果未启用GPU，直接返回空向量，将使用CPU实现
    return std::vector<int>();
 #endif
 }
-// GPU加速的体素网格降采样
+// =================================================================
-__device__ __host__ int3 toVoxelIndex(const float3& point, float resolution, float range)
+// 体素滤波
 // =================================================================
 __global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts,
                                  float3* centers, int grid_size)
 {
-    int3 idx;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    float half_range = range / 2.0f;
+    if (i >= num_points) return;
-    idx.x = (int)((half_range - point.x) / resolution);
+    float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2];
-    idx.y = (int)((point.y + half_range) / resolution);
+    float h = range * 0.5f;
    idx.z = (int)((point.z + half_range) / resolution);
-    return idx;
+    int vx = __float2int_rn((h - x) / res);
-}
+    int vy = __float2int_rn((y + h) / res);
    int vz = __float2int_rn((z + h) / res);
-__global__ void voxelizeKernel(const float* points, int num_points, float resolution, float range, int* voxel_counts,
+    if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return;
-                               float3* voxel_centers, int grid_size)
+    int idx = vz * grid_size * grid_size + vy * grid_size + vx;
-{
+    int n = atomicAdd(&counts[idx], 1);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < num_points)
+    if (n == 0)
    {
-        float3 point;
+        centers[idx] = make_float3(x, y, z);
        point.x = points[idx * 3 + 0];
        point.y = points[idx * 3 + 1];
        point.z = points[idx * 3 + 2];
        int3 voxel_idx = toVoxelIndex(point, resolution, range);
        // 检查边界
        if (voxel_idx.x >= 0 && voxel_idx.x < grid_size && voxel_idx.y >= 0 && voxel_idx.y < grid_size &&
            voxel_idx.z >= 0 && voxel_idx.z < grid_size)
        {
            int linear_idx = voxel_idx.z * grid_size * grid_size + voxel_idx.y * grid_size + voxel_idx.x;
            int count = atomicAdd(&voxel_counts[linear_idx], 1);
            if (count == 0)
            {
                float3 center;
                center.x = point.x;
                center.y = point.y;
                center.z = point.z;
                voxel_centers[linear_idx] = center;
    }
    else
    {
-                float3 center = voxel_centers[linear_idx];
+        float3 c = centers[idx];
-                center.x = (center.x * count + point.x) / (count + 1);
+        c.x = (c.x * n + x) / (n + 1);
-                center.y = (center.y * count + point.y) / (count + 1);
+        c.y = (c.y * n + y) / (n + 1);
-                center.z = (center.z * count + point.z) / (count + 1);
+        c.z = (c.z * n + z) / (n + 1);
-                voxel_centers[linear_idx] = center;
+        centers[idx] = c;
            }
        }
    }
 }
-// 主机端调用的体素化函数
+std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float res, float range)
 std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
 {
 #ifdef USE_GPU
-    if (!isDeviceAvailable())
+    if (points.empty()) return {};
-    {
+    int n = points.size() / 3;
-        return std::vector<float3>();
+    int g = static_cast<int>(range / res);
-    }
+    size_t total = g * g * g;
-    int grid_size = static_cast<int>(range / resolution);
+    std::vector<int> cnt(total, 0);
    std::vector<float3> cen(total);
-    std::vector<int> voxel_counts(grid_size * grid_size * grid_size, 0);
+    float* d_p;
-    std::vector<float3> voxel_centers(grid_size * grid_size * grid_size);
+    int* d_c;
    float3* d_cen;
-    // 在设备上分配内存
+    cudaMalloc(&d_p, points.size() * sizeof(float));
-    float* d_points;
+    cudaMalloc(&d_c, total * sizeof(int));
-    int* d_voxel_counts;
+    cudaMalloc(&d_cen, total * sizeof(float3));
    float3* d_voxel_centers;
-    cudaMalloc(&d_points, points.size() * sizeof(float));
+    cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMalloc(&d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int));
+    cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMalloc(&d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3));
+    cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
+    int block = 1024;
-    cudaMemcpy(d_voxel_counts, voxel_counts.data(), grid_size * grid_size * grid_size * sizeof(int),
+    int grid = (n + block - 1) / block;
-               cudaMemcpyHostToDevice);
+    voxelizeKernelOpt<<<grid, block>>>(d_p, n, res, range, d_c, d_cen, g);
    cudaMemcpy(d_voxel_centers, voxel_centers.data(), grid_size * grid_size * grid_size * sizeof(float3),
               cudaMemcpyHostToDevice);
-    // 调用GPU内核
+    cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost);
-    int block_size = 256;
+    cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost);
    int num_blocks = (points.size() / 3 + block_size - 1) / block_size;
    voxelizeKernel<<<num_blocks, block_size>>>(d_points, points.size() / 3, resolution, range, d_voxel_counts,
                                               d_voxel_centers, grid_size);
-    // 从设备复制回数据
+    std::vector<PointFloat> r;
-    cudaMemcpy(voxel_counts.data(), d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int),
+    r.reserve(total / 10);
-               cudaMemcpyDeviceToHost);
+    for (size_t i = 0; i < total; i++)
-    cudaMemcpy(voxel_centers.data(), d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3),
+        if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z);
               cudaMemcpyDeviceToHost);
-    // 收集有效体素
+    cudaFree(d_p);
-    std::vector<float3> results;
+    cudaFree(d_c);
-    for (int i = 0; i < grid_size * grid_size * grid_size; ++i)
+    cudaFree(d_cen);
-    {
+    return r;
        if (voxel_counts[i] > 0)
        {
            results.push_back(voxel_centers[i]);
        }
    }
    // 释放设备内存
    cudaFree(d_points);
    cudaFree(d_voxel_counts);
    cudaFree(d_voxel_centers);
    return results;
 #else
-    return std::vector<float3>();
+    return {};
 #endif
 }
-// GPU加速的统计离群点去除
+// =================================================================
-__global__ void computeMeanDistanceKernel(const float* points, int num_points, int k, float mean_k, float std_mult,
+// 离群点去除
-                                          int* valid_indices, int* count_out)
+// =================================================================
 std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
 {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t);
    if (idx < num_points)
    {
        float3 center;
        center.x = points[idx * 3 + 0];
        center.y = points[idx * 3 + 1];
        center.z = points[idx * 3 + 2];
        // 计算到k个最近邻居的距离之和（简化版：直接比较距离）
        float sum_dist = 0.0f;
        int count = 0;
        for (int i = 0; i < num_points; ++i)
        {
            if (i == idx) continue;
            float3 other;
            other.x = points[i * 3 + 0];
            other.y = points[i * 3 + 1];
            other.z = points[i * 3 + 2];
            float dx = other.x - center.x;
            float dy = other.y - center.y;
            float dz = other.z - center.z;
            float dist = sqrt(dx * dx + dy * dy + dz * dz);
            if (dist < mean_k)
            {
                sum_dist += dist;
                count++;
                if (count >= k) break;
            }
        }
        if (count >= k)
        {
            float mean = sum_dist / count;
            if (mean < mean_k * std_mult)
            {
                int pos = atomicAdd(count_out, 1);
                valid_indices[pos] = idx;
            }
        }
    }
 }
-std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
+// =================================================================
 // PCL 格式转换
 // =================================================================
 std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
 {
-#ifdef USE_GPU
+    std::vector<float> r;
-    if (!isDeviceAvailable())
+    if (!cloud) return r;
    r.reserve(cloud->size() * 3);
    for (auto& p : cloud->points)
    {
-        return std::vector<int>();
+        r.push_back(p.x);
        r.push_back(p.y);
        r.push_back(p.z);
    }
-
+    return r;
    int num_points = points.size() / 3;
    int* d_indices;
    int* d_count;
    cudaMalloc(&d_indices, num_points * sizeof(int));
    cudaMalloc(&d_count, sizeof(int));
    cudaMemset(d_count, 0, sizeof(int));
    float* d_points;
    cudaMalloc(&d_points, points.size() * sizeof(float));
    cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
    int block_size = 64;
    int num_blocks = (num_points + block_size - 1) / block_size;
    computeMeanDistanceKernel<<<num_blocks, block_size>>>(d_points, num_points, mean_k, mean_k, std_thresh, d_indices,
                                                          d_count);
    int host_count;
    cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    std::vector<int> results(host_count);
    if (host_count > 0)
    {
        cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
    }
    cudaFree(d_points);
    cudaFree(d_indices);
    cudaFree(d_count);
    return results;
 #else
    return std::vector<int>();
 #endif
 }
-// 辅助函数：将PCL点云转换为简单浮点数组
+typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
 std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud)
 {
-    std::vector<float> result;
+    auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>);
-    if (!cloud) return result;
+    size_t n = points.size() / 3;
-
+    cloud->resize(n);
-    result.reserve(cloud->size() * 3);
+    for (size_t i = 0; i < n; i++)
    for (const auto& point : cloud->points)
    {
        result.push_back(point.x);
        result.push_back(point.y);
        result.push_back(point.z);
    }
    return result;
 }
 // 辅助函数：将浮点数组转换为PCL点云
 boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points)
 {
    auto cloud = boost::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
    cloud->resize(points.size() / 3);
    for (size_t i = 0; i < points.size() / 3; ++i)
    {
        cloud->points[i].x = points[i * 3 + 0];
        cloud->points[i].y = points[i * 3 + 1];
        cloud->points[i].z = points[i * 3 + 2];
    }
-
+    cloud->width = n;
    cloud->width = cloud->points.size();
    cloud->height = 1;
    cloud->is_dense = true;
    return cloud;
 }
--- a/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp
+++ b/src/perception/rslidar_pointcloud_merger/src/merge_two_lidars.cpp
@ -280,22 +280,23 @@ class LidarMerger : public rclcpp::Node
        pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_pcl(new pcl::PointCloud<pcl::PointXYZ>);
        pcl::fromROSMsg(*cloud_msg, *cloud_pcl);
-        // GPU加速处理
+        // GPU加速处理 - 优化版：使用合并内核提高GPU利用率
 #ifdef USE_GPU
        if (use_gpu_)
        {
-            RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing");
+            RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing (merged kernel)");
            // 转换为简单浮点数组
            auto float_array = convertToFloatArray(cloud_pcl);
-            // 1. GPU加速的空间过滤
+            // 使用合并处理内核：空间过滤+体素有效性检查+统计离群点去除
-            auto valid_indices = gpuFilterPointCloud(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
+            auto valid_indices = gpuMergedProcessing(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
-                                                     filter_max_x_, filter_max_y_, filter_max_z_);
+                                                     filter_max_x_, filter_max_y_, filter_max_z_, voxel_size_,
                                                     grid_range_, stat_mean_k_, stat_std_thresh_);
            if (valid_indices.empty())
            {
-                RCLCPP_WARN(get_logger(), "[GPU] Filtered cloud is empty");
+                RCLCPP_WARN(get_logger(), "[GPU] Merged processing result is empty");
                return nullptr;
            }
@ -309,47 +310,10 @@ class LidarMerger : public rclcpp::Node
                filtered_points.push_back(float_array[idx * 3 + 2]);
            }
            // 2. GPU加速的体素降采样
            auto voxel_points = gpuVoxelGridFilter(filtered_points, voxel_size_, grid_range_);
            if (voxel_points.empty())
            {
                RCLCPP_WARN(get_logger(), "[GPU] Voxel filtered cloud is empty");
                return nullptr;
            }
            // 3. GPU加速的统计离群点去除
            std::vector<float> voxel_array;
            voxel_array.reserve(voxel_points.size() * 3);
            for (const auto& p : voxel_points)
            {
                voxel_array.push_back(p.x);
                voxel_array.push_back(p.y);
                voxel_array.push_back(p.z);
            }
            auto inlier_indices = gpuStatisticalOutlierRemoval(voxel_array, stat_mean_k_, stat_std_thresh_);
            // 转换回PCL格式
-            std::vector<float> final_points;
+            processed_pcl_out = convertToPCLCloud(filtered_points);
            if (inlier_indices.empty())
            {
                // 如果GPU离群点去除失败，直接使用体素化结果
                final_points = voxel_array;
            }
            else
            {
                final_points.reserve(inlier_indices.size() * 3);
                for (int idx : inlier_indices)
                {
                    final_points.push_back(voxel_array[idx * 3 + 0]);
                    final_points.push_back(voxel_array[idx * 3 + 1]);
                    final_points.push_back(voxel_array[idx * 3 + 2]);
                }
            }
-            processed_pcl_out = convertToPCLCloud(final_points);
+            // 车身和主刷过滤（使用CPU进行精细过滤，保持栅格格式一致）
            // 4. 车身和主刷过滤（使用CPU进行精细过滤，保持栅格格式一致）
            if (filter_car_)
            {
                pcl::PointCloud<pcl::PointXYZ>::Ptr temp(new pcl::PointCloud<pcl::PointXYZ>);