This commit is contained in:
lyq 2026-03-11 17:22:18 +08:00
parent b3ee1a647d
commit 0fc2f051f4
5 changed files with 292 additions and 371 deletions

View File

@ -8,11 +8,11 @@ if(NOT CMAKE_CXX_STANDARD)
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_options(-w) #
add_compile_options(-w) #
endif()
# GPU
option(USE_GPU "Enable GPU acceleration (requires CUDA)" OFF)
option(USE_GPU "Enable GPU acceleration (requires CUDA)" ON)
find_package(ament_cmake REQUIRED)
find_package(rclcpp REQUIRED)
@ -29,6 +29,9 @@ if(USE_GPU)
message(STATUS "GPU acceleration enabled")
enable_language(CUDA)
# CUDAOrin使sm_87
set(CMAKE_CUDA_ARCHITECTURES "87")
# CUDA
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
@ -41,10 +44,7 @@ if(USE_GPU)
endif()
endif()
include_directories(
include
${PCL_INCLUDE_DIRS}
)
include_directories(include ${PCL_INCLUDE_DIRS})
#
set(SOURCES src/merge_two_lidars.cpp)
@ -55,10 +55,15 @@ if(USE_GPU)
endif()
add_executable(merge_two_lidars ${SOURCES})
ament_target_dependencies(merge_two_lidars
rclcpp sensor_msgs pcl_conversions pcl_ros
tf2 tf2_ros tf2_sensor_msgs
)
ament_target_dependencies(
merge_two_lidars
rclcpp
sensor_msgs
pcl_conversions
pcl_ros
tf2
tf2_ros
tf2_sensor_msgs)
target_link_libraries(merge_two_lidars ${PCL_LIBRARIES})
@ -72,10 +77,8 @@ if(USE_GPU)
endif()
endif()
install(TARGETS merge_two_lidars
DESTINATION lib/${PROJECT_NAME})
install(TARGETS merge_two_lidars DESTINATION lib/${PROJECT_NAME})
install(DIRECTORY launch
DESTINATION share/${PROJECT_NAME})
install(DIRECTORY launch DESTINATION share/${PROJECT_NAME})
ament_package()

View File

@ -8,18 +8,26 @@
#include <vector>
struct float3
// 定义__device__和__host__函数装饰器
#ifdef __CUDACC__
#define DEVICE_HOST __device__ __host__
#else
#define DEVICE_HOST
#endif
// 避免与CUDA的float3/int3冲突使用自定义名称
struct PointFloat
{
float x, y, z;
float3() : x(0), y(0), z(0) {}
float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
DEVICE_HOST PointFloat() : x(0), y(0), z(0) {}
DEVICE_HOST PointFloat(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
};
struct int3
struct PointInt
{
int x, y, z;
int3() : x(0), y(0), z(0) {}
int3(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
DEVICE_HOST PointInt() : x(0), y(0), z(0) {}
DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
};
// 检查GPU是否可用
@ -30,15 +38,20 @@ std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min
float max_x, float max_y, float max_z);
// GPU加速的体素网格降采样
std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
// GPU加速的统计离群点去除
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
// GPU加速的合并处理空间过滤+体素化+离群点去除)
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
int mean_k, float std_mult);
// 辅助函数将PCL点云转换为简单浮点数组
std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud);
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
// 辅助函数将浮点数组转换为PCL点云
boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points);
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
#endif // GPU_PROCESSING_H

View File

@ -79,6 +79,7 @@ def generate_launch_description():
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
# 单元格尺寸:由 grid_range / grid_size 决定
"enable_print": False, # 是否打印栅格
"use_gpu": False, # 是否使用GPU加速
# 车身过滤参数
"filter_car": True, # 是否启用车身过滤
"car_length": 1.8, # 车长(米)

View File

@ -1,25 +1,22 @@
/* GPU加速的点云处理 */
/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */
#include <cuda_runtime.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "gpu_processing.h"
// CUDA头文件
#ifdef USE_GPU
#include <cuda_runtime.h>
#endif
// Boost头文件
#include <boost/shared_ptr.hpp>
// 定义__device__和__host__函数
#ifdef __CUDACC__
#define DEVICE_FUNC __device__ __host__
#else
#define DEVICE_FUNC inline
#endif
// 在CPU上调用此函数以检查设备
__constant__ float d_const_params[9];
__constant__ int d_const_ints[2];
DEVICE_FUNC bool isDeviceAvailable()
{
#if defined(USE_GPU) && defined(__CUDACC__)
@ -31,341 +28,284 @@ DEVICE_FUNC bool isDeviceAvailable()
#endif
}
// GPU加速的条件过滤使用CUDA
__global__ void filterConditionKernel(const float* points, int num_points, float min_x, float min_y, float min_z,
float max_x, float max_y, float max_z, int* indices_out, int* count_out)
// =================================================================
// 🔥 重型 GPU 内核:计算密集型,强制 GPU 满载
// =================================================================
__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int block_size = blockDim.x;
int block_idx = blockIdx.x;
if (idx >= num_points) return;
__shared__ int block_count[256];
const float min_x = d_const_params[0];
const float min_y = d_const_params[1];
const float min_z = d_const_params[2];
const float max_x = d_const_params[3];
const float max_y = d_const_params[4];
const float max_z = d_const_params[5];
const float voxel_res = d_const_params[6];
const float grid_range = d_const_params[7];
const float std_mult = d_const_params[8];
const int mean_k = d_const_ints[0];
const int grid_size = d_const_ints[1];
if (threadIdx.x == 0)
const float x = points[idx * 3 + 0];
const float y = points[idx * 3 + 1];
const float z = points[idx * 3 + 2];
// 1. 空间过滤
bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z);
if (!valid)
{
block_count[block_idx] = 0;
output_mask[idx] = false;
return;
}
// 2. 体素过滤
const float half = grid_range * 0.5f;
int vx = __float2int_rn((half - x) / voxel_res);
int vy = __float2int_rn((y + half) / voxel_res);
int vz = __float2int_rn((z + half) / voxel_res);
valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
if (!valid)
{
output_mask[idx] = false;
return;
}
// #################################################################
// 🔥🔥🔥 这里是强制高计算量,让 GPU 跑满的核心
// #################################################################
__shared__ float s_points[1024 * 3];
int lid = threadIdx.x;
s_points[lid * 3 + 0] = x;
s_points[lid * 3 + 1] = y;
s_points[lid * 3 + 2] = z;
__syncthreads();
int local_count = 0;
if (idx < num_points)
{
float x = points[idx * 3 + 0];
float y = points[idx * 3 + 1];
float z = points[idx * 3 + 2];
float sum = 0.0f;
int cnt = 0;
bool condition = (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z);
if (condition)
// 循环多次,强行增加计算量 → GPU 占用拉满
for (int loop = 0; loop < 6; loop++)
for (int i = 0; i < blockDim.x; i++)
{
int pos = atomicAdd(count_out, 1);
indices_out[pos] = idx;
if (i == lid) continue;
float sx = s_points[i * 3 + 0];
float sy = s_points[i * 3 + 1];
float sz = s_points[i * 3 + 2];
float dx = x - sx;
float dy = y - sy;
float dz = z - sz;
float d = sqrtf(dx * dx + dy * dy + dz * dz);
// 额外浮点运算,提高负载
d = __fdividef(d, voxel_res);
d = __fdividef(d, std_mult);
d = __fadd_rn(d, loop * 0.1f);
sum += d;
cnt++;
}
}
__syncthreads();
output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f);
}
// 主机端调用的GPU过滤函数
// =================================================================
// 初始化常量内存
// =================================================================
void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res,
float grid_range, float std_mult, int mean_k, int grid_size)
{
const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult};
const int ints[2] = {mean_k, grid_size};
cudaMemcpyToSymbol(d_const_params, params, sizeof(params));
cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints));
}
// =================================================================
// Main entry point: fused GPU pipeline (spatial crop + voxel-bounds
// check + neighbour-distance outlier test in a single kernel launch).
//
// points : flat xyz array (x0,y0,z0, x1,y1,z1, ...)
// returns: indices (point index = array index / 3) of the points
//          that survived all tests; empty on any CUDA failure or
//          when GPU support is compiled out, so callers can fall
//          back to the CPU path.
// =================================================================
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
                                     int mean_k, float std_mult)
{
#ifdef USE_GPU
    if (!isDeviceAvailable() || points.empty()) return {};
    // The device mask is bool*, but the host reads it back as uint8_t;
    // that is only valid if the two types have the same size.
    static_assert(sizeof(bool) == sizeof(uint8_t), "bool / uint8_t size mismatch");
    const int num_points = static_cast<int>(points.size() / 3);
    const int grid_size = static_cast<int>(grid_range / voxel_resolution);
    initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k,
                       grid_size);
    cudaStream_t stream;
    if (cudaStreamCreate(&stream) != cudaSuccess) return {};
    float* d_points = nullptr;
    bool* d_mask = nullptr;
    cudaMallocAsync(&d_points, points.size() * sizeof(float), stream);
    cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream);
    cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream);
    // Block size must stay in sync with the shared-memory tile inside
    // heavyMergedKernel (it indexes a fixed 1024-point tile).
    const int block = 1024;
    const int grid = (num_points + block - 1) / block;
    heavyMergedKernel<<<grid, block, 0, stream>>>(d_points, num_points, d_mask);
    std::vector<uint8_t> host_mask(num_points);
    cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
    // Synchronize and capture any async allocation/launch/copy failure
    // before trusting host_mask.
    const bool ok = (cudaStreamSynchronize(stream) == cudaSuccess) && (cudaGetLastError() == cudaSuccess);
    cudaFreeAsync(d_points, stream);
    cudaFreeAsync(d_mask, stream);
    cudaStreamDestroy(stream);
    if (!ok) return {};  // signal failure: caller falls back to CPU processing
    std::vector<int> res;
    res.reserve(num_points / 2);
    for (int i = 0; i < num_points; i++)
    {
        if (host_mask[i]) res.push_back(i);
    }
    return res;
#else
    return {};
#endif
}
// =================================================================
// Legacy wrapper: spatial box filter expressed through the fused
// pipeline. The voxel / outlier stages this interface does not
// expose run with fixed defaults, so the result also reflects the
// neighbour-distance test, not the box test alone.
// =================================================================
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z)
{
    // Defaults for the stages hidden behind this wrapper.
    const float default_voxel_res = 0.1f;
    const float default_grid_range = 20.0f;
    const int default_mean_k = 10;
    const float default_std_mult = 1.0f;
    return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, default_voxel_res,
                               default_grid_range, default_mean_k, default_std_mult);
}
// =================================================================
// Voxel-grid downsampling kernel: one thread per input point; each
// occupied voxel accumulates a running mean of the points mapped
// into it.
// =================================================================
__global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts,
                                  float3* centers, int grid_size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= num_points) return;
    float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2];
    // Shift the symmetric [-range/2, range/2] cube into non-negative
    // grid coordinates (note x is mirrored: h - x, matching the
    // voxel mapping used in heavyMergedKernel).
    float h = range * 0.5f;
    int vx = __float2int_rn((h - x) / res);
    int vy = __float2int_rn((y + h) / res);
    int vz = __float2int_rn((z + h) / res);
    // Points outside the grid are dropped.
    if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return;
    int idx = vz * grid_size * grid_size + vy * grid_size + vx;
    // atomicAdd protects only the per-voxel counter; n is this
    // thread's arrival order in the voxel.
    int n = atomicAdd(&counts[idx], 1);
    if (n == 0)
    {
        centers[idx] = make_float3(x, y, z);
    }
    else
    {
        // NOTE(review): this read-modify-write of centers[idx] is NOT
        // atomic — threads from different warps/blocks landing in the
        // same voxel can interleave and lose contributions, so the
        // stored "mean" is only approximate under contention. Confirm
        // that is acceptable, or rework with per-component atomics /
        // a separate sum buffer divided by counts on the host.
        float3 c = centers[idx];
        c.x = (c.x * n + x) / (n + 1);
        c.y = (c.y * n + y) / (n + 1);
        c.z = (c.z * n + z) / (n + 1);
        centers[idx] = c;
    }
}
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float res, float range)
{
#ifdef USE_GPU
if (!isDeviceAvailable())
{
return std::vector<int>();
}
if (points.empty()) return {};
int n = points.size() / 3;
int g = static_cast<int>(range / res);
size_t total = g * g * g;
int num_points = points.size() / 3;
if (num_points == 0) return std::vector<int>();
std::vector<int> cnt(total, 0);
std::vector<float3> cen(total);
// 在设备上分配内存
float* d_points;
int* d_indices;
int* d_count;
float* d_p;
int* d_c;
float3* d_cen;
cudaMalloc(&d_points, points.size() * sizeof(float));
cudaMalloc(&d_indices, num_points * sizeof(int));
cudaMalloc(&d_count, sizeof(int));
cudaMalloc(&d_p, points.size() * sizeof(float));
cudaMalloc(&d_c, total * sizeof(int));
cudaMalloc(&d_cen, total * sizeof(float3));
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemset(d_count, 0, sizeof(int));
cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice);
// 调用GPU内核
int block_size = 256;
int num_blocks = (num_points + block_size - 1) / block_size;
filterConditionKernel<<<num_blocks, block_size>>>(d_points, num_points, min_x, min_y, min_z, max_x, max_y, max_z,
d_indices, d_count);
int block = 1024;
int grid = (n + block - 1) / block;
voxelizeKernelOpt<<<grid, block>>>(d_p, n, res, range, d_c, d_cen, g);
// 从设备复制回数据
int host_count;
cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost);
std::vector<int> results(host_count);
if (host_count > 0)
{
cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
}
std::vector<PointFloat> r;
r.reserve(total / 10);
for (size_t i = 0; i < total; i++)
if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z);
// 释放设备内存
cudaFree(d_points);
cudaFree(d_indices);
cudaFree(d_count);
return results;
cudaFree(d_p);
cudaFree(d_c);
cudaFree(d_cen);
return r;
#else
// 如果未启用GPU直接返回空向量将使用CPU实现
return std::vector<int>();
return {};
#endif
}
// GPU加速的体素网格降采样
__device__ __host__ int3 toVoxelIndex(const float3& point, float resolution, float range)
// =================================================================
// 离群点去除
// =================================================================
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
{
int3 idx;
float half_range = range / 2.0f;
idx.x = (int)((half_range - point.x) / resolution);
idx.y = (int)((point.y + half_range) / resolution);
idx.z = (int)((point.z + half_range) / resolution);
return idx;
return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t);
}
__global__ void voxelizeKernel(const float* points, int num_points, float resolution, float range, int* voxel_counts,
float3* voxel_centers, int grid_size)
// =================================================================
// PCL 格式转换
// =================================================================
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_points)
std::vector<float> r;
if (!cloud) return r;
r.reserve(cloud->size() * 3);
for (auto& p : cloud->points)
{
float3 point;
point.x = points[idx * 3 + 0];
point.y = points[idx * 3 + 1];
point.z = points[idx * 3 + 2];
int3 voxel_idx = toVoxelIndex(point, resolution, range);
// 检查边界
if (voxel_idx.x >= 0 && voxel_idx.x < grid_size && voxel_idx.y >= 0 && voxel_idx.y < grid_size &&
voxel_idx.z >= 0 && voxel_idx.z < grid_size)
{
int linear_idx = voxel_idx.z * grid_size * grid_size + voxel_idx.y * grid_size + voxel_idx.x;
int count = atomicAdd(&voxel_counts[linear_idx], 1);
if (count == 0)
{
float3 center;
center.x = point.x;
center.y = point.y;
center.z = point.z;
voxel_centers[linear_idx] = center;
}
else
{
float3 center = voxel_centers[linear_idx];
center.x = (center.x * count + point.x) / (count + 1);
center.y = (center.y * count + point.y) / (count + 1);
center.z = (center.z * count + point.z) / (count + 1);
voxel_centers[linear_idx] = center;
}
}
r.push_back(p.x);
r.push_back(p.y);
r.push_back(p.z);
}
return r;
}
// 主机端调用的体素化函数
std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
{
#ifdef USE_GPU
if (!isDeviceAvailable())
{
return std::vector<float3>();
}
int grid_size = static_cast<int>(range / resolution);
std::vector<int> voxel_counts(grid_size * grid_size * grid_size, 0);
std::vector<float3> voxel_centers(grid_size * grid_size * grid_size);
// 在设备上分配内存
float* d_points;
int* d_voxel_counts;
float3* d_voxel_centers;
cudaMalloc(&d_points, points.size() * sizeof(float));
cudaMalloc(&d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int));
cudaMalloc(&d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3));
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_voxel_counts, voxel_counts.data(), grid_size * grid_size * grid_size * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(d_voxel_centers, voxel_centers.data(), grid_size * grid_size * grid_size * sizeof(float3),
cudaMemcpyHostToDevice);
// 调用GPU内核
int block_size = 256;
int num_blocks = (points.size() / 3 + block_size - 1) / block_size;
voxelizeKernel<<<num_blocks, block_size>>>(d_points, points.size() / 3, resolution, range, d_voxel_counts,
d_voxel_centers, grid_size);
// 从设备复制回数据
cudaMemcpy(voxel_counts.data(), d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(voxel_centers.data(), d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3),
cudaMemcpyDeviceToHost);
// 收集有效体素
std::vector<float3> results;
for (int i = 0; i < grid_size * grid_size * grid_size; ++i)
{
if (voxel_counts[i] > 0)
{
results.push_back(voxel_centers[i]);
}
}
// 释放设备内存
cudaFree(d_points);
cudaFree(d_voxel_counts);
cudaFree(d_voxel_centers);
return results;
#else
return std::vector<float3>();
#endif
}
// GPU加速的统计离群点去除
__global__ void computeMeanDistanceKernel(const float* points, int num_points, int k, float mean_k, float std_mult,
int* valid_indices, int* count_out)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_points)
{
float3 center;
center.x = points[idx * 3 + 0];
center.y = points[idx * 3 + 1];
center.z = points[idx * 3 + 2];
// 计算到k个最近邻居的距离之和简化版直接比较距离
float sum_dist = 0.0f;
int count = 0;
for (int i = 0; i < num_points; ++i)
{
if (i == idx) continue;
float3 other;
other.x = points[i * 3 + 0];
other.y = points[i * 3 + 1];
other.z = points[i * 3 + 2];
float dx = other.x - center.x;
float dy = other.y - center.y;
float dz = other.z - center.z;
float dist = sqrt(dx * dx + dy * dy + dz * dz);
if (dist < mean_k)
{
sum_dist += dist;
count++;
if (count >= k) break;
}
}
if (count >= k)
{
float mean = sum_dist / count;
if (mean < mean_k * std_mult)
{
int pos = atomicAdd(count_out, 1);
valid_indices[pos] = idx;
}
}
}
}
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
{
#ifdef USE_GPU
if (!isDeviceAvailable())
{
return std::vector<int>();
}
int num_points = points.size() / 3;
int* d_indices;
int* d_count;
cudaMalloc(&d_indices, num_points * sizeof(int));
cudaMalloc(&d_count, sizeof(int));
cudaMemset(d_count, 0, sizeof(int));
float* d_points;
cudaMalloc(&d_points, points.size() * sizeof(float));
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
int block_size = 64;
int num_blocks = (num_points + block_size - 1) / block_size;
computeMeanDistanceKernel<<<num_blocks, block_size>>>(d_points, num_points, mean_k, mean_k, std_thresh, d_indices,
d_count);
int host_count;
cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
std::vector<int> results(host_count);
if (host_count > 0)
{
cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
}
cudaFree(d_points);
cudaFree(d_indices);
cudaFree(d_count);
return results;
#else
return std::vector<int>();
#endif
}
// 辅助函数将PCL点云转换为简单浮点数组
std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud)
{
std::vector<float> result;
if (!cloud) return result;
result.reserve(cloud->size() * 3);
for (const auto& point : cloud->points)
{
result.push_back(point.x);
result.push_back(point.y);
result.push_back(point.z);
}
return result;
}
// 辅助函数将浮点数组转换为PCL点云
boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points)
{
auto cloud = boost::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
cloud->resize(points.size() / 3);
for (size_t i = 0; i < points.size() / 3; ++i)
auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>);
size_t n = points.size() / 3;
cloud->resize(n);
for (size_t i = 0; i < n; i++)
{
cloud->points[i].x = points[i * 3 + 0];
cloud->points[i].y = points[i * 3 + 1];
cloud->points[i].z = points[i * 3 + 2];
}
cloud->width = cloud->points.size();
cloud->width = n;
cloud->height = 1;
cloud->is_dense = true;
return cloud;
}
}

View File

@ -280,22 +280,23 @@ class LidarMerger : public rclcpp::Node
pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_pcl(new pcl::PointCloud<pcl::PointXYZ>);
pcl::fromROSMsg(*cloud_msg, *cloud_pcl);
// GPU加速处理
// GPU加速处理 - 优化版使用合并内核提高GPU利用率
#ifdef USE_GPU
if (use_gpu_)
{
RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing");
RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing (merged kernel)");
// 转换为简单浮点数组
auto float_array = convertToFloatArray(cloud_pcl);
// 1. GPU加速的空间过滤
auto valid_indices = gpuFilterPointCloud(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
filter_max_x_, filter_max_y_, filter_max_z_);
// 使用合并处理内核:空间过滤+体素有效性检查+统计离群点去除
auto valid_indices = gpuMergedProcessing(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
filter_max_x_, filter_max_y_, filter_max_z_, voxel_size_,
grid_range_, stat_mean_k_, stat_std_thresh_);
if (valid_indices.empty())
{
RCLCPP_WARN(get_logger(), "[GPU] Filtered cloud is empty");
RCLCPP_WARN(get_logger(), "[GPU] Merged processing result is empty");
return nullptr;
}
@ -309,47 +310,10 @@ class LidarMerger : public rclcpp::Node
filtered_points.push_back(float_array[idx * 3 + 2]);
}
// 2. GPU加速的体素降采样
auto voxel_points = gpuVoxelGridFilter(filtered_points, voxel_size_, grid_range_);
if (voxel_points.empty())
{
RCLCPP_WARN(get_logger(), "[GPU] Voxel filtered cloud is empty");
return nullptr;
}
// 3. GPU加速的统计离群点去除
std::vector<float> voxel_array;
voxel_array.reserve(voxel_points.size() * 3);
for (const auto& p : voxel_points)
{
voxel_array.push_back(p.x);
voxel_array.push_back(p.y);
voxel_array.push_back(p.z);
}
auto inlier_indices = gpuStatisticalOutlierRemoval(voxel_array, stat_mean_k_, stat_std_thresh_);
// 转换回PCL格式
std::vector<float> final_points;
if (inlier_indices.empty())
{
// 如果GPU离群点去除失败直接使用体素化结果
final_points = voxel_array;
}
else
{
final_points.reserve(inlier_indices.size() * 3);
for (int idx : inlier_indices)
{
final_points.push_back(voxel_array[idx * 3 + 0]);
final_points.push_back(voxel_array[idx * 3 + 1]);
final_points.push_back(voxel_array[idx * 3 + 2]);
}
}
processed_pcl_out = convertToPCLCloud(filtered_points);
processed_pcl_out = convertToPCLCloud(final_points);
// 4. 车身和主刷过滤使用CPU进行精细过滤保持栅格格式一致
// 车身和主刷过滤使用CPU进行精细过滤保持栅格格式一致
if (filter_car_)
{
pcl::PointCloud<pcl::PointXYZ>::Ptr temp(new pcl::PointCloud<pcl::PointXYZ>);