优化点云处理性能

This commit is contained in:
lyq 2026-03-12 16:11:32 +08:00
parent 0fc2f051f4
commit 8a06e6b9a1
4 changed files with 1755 additions and 1075 deletions

View File

@ -1,21 +1,31 @@
/* GPU加速的点云处理头文件 */ /* GPU加速的点云处理头文件 - 优化版 */
/* 主要优化:
* 1. GPU内存持久化 malloc/free
* 2. Pinned Memory H2D/D2H
* 3. CUDA Stream
* 4. voxelizeKernel x/y/z
* 5. SOR
* 6. + SOR
*/
#ifndef GPU_PROCESSING_H #ifndef GPU_PROCESSING_H
#define GPU_PROCESSING_H #define GPU_PROCESSING_H
#include <cuda_runtime.h>
#include <pcl/point_cloud.h> #include <pcl/point_cloud.h>
#include <pcl/point_types.h> #include <pcl/point_types.h>
#include <cstdint>
#include <vector> #include <vector>
// 定义__device__和__host__函数装饰器
#ifdef __CUDACC__ #ifdef __CUDACC__
#define DEVICE_HOST __device__ __host__ #define DEVICE_HOST __device__ __host__
#else #else
#define DEVICE_HOST #define DEVICE_HOST
#endif #endif
// 避免与CUDA的float3/int3冲突使用自定义名称 /* ===================== 基础数据结构 ===================== */
struct PointFloat struct PointFloat
{ {
float x, y, z; float x, y, z;
@ -30,28 +40,153 @@ struct PointInt
DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {} DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
}; };
// 检查GPU是否可用 /* ===================== GPU 处理参数 ===================== */
/* Parameter bundle for the full GPU point-cloud pipeline
 * (spatial crop -> voxel downsample -> statistical outlier removal). */
struct GpuProcessingParams
{
    // Spatial crop box: points outside [min, max] on any axis are dropped
    float min_x, max_x;
    float min_y, max_y;
    float min_z, max_z;
    // Voxel-grid downsampling parameters
    float voxel_resolution;  // voxel edge length (meters)
    float grid_range;        // covered range (meters), centered on the origin
    // SOR parameters (statistical outlier removal)
    int sor_mean_k;          // number of nearest neighbours k
    float sor_std_mult;      // standard-deviation multiplier threshold
    // Vehicle-body exclusion box (points inside are removed when filter_car is set)
    float car_min_x, car_max_x;
    float car_min_y, car_max_y;
    float car_min_z, car_max_z;
    bool filter_car;
    // Sweeping-brush exclusion boxes (right / left side)
    float brush_r_min_x, brush_r_max_x;
    float brush_r_min_y, brush_r_max_y;
    float brush_r_min_z, brush_r_max_z;
    float brush_l_min_x, brush_l_max_x;
    float brush_l_min_y, brush_l_max_y;
    float brush_l_min_z, brush_l_max_z;
    bool filter_brush;
};
/* ===================== 持久化 GPU 资源管理器 =====================
*
* GpuResources::init() destroy()
* GPU / CUDA Stream
* cudaMalloc/cudaFree
*
*
* stream[0]
* stream[1]
*/
/* Persistent GPU resource manager (singleton).
 *
 * init()/destroy() bracket the lifetime of all device buffers, pinned host
 * buffers and CUDA streams, so the per-frame processing path never calls
 * cudaMalloc/cudaFree. Every buffer exists once per lidar channel
 * (index 0 = front, 1 = rear), each with its own stream, so the two clouds
 * can be processed concurrently. */
class GpuResources
{
public:
    /* ---- Singleton access ---- */
    static GpuResources& instance();
    /* ---- Lifecycle ---- */
    /**
     * @brief Allocates all persistent device/pinned buffers and the CUDA streams.
     * @param max_points_per_cloud  maximum number of points per input cloud
     * @param max_voxel_grid_dim    per-axis voxel count = grid_range / voxel_resolution
     */
    bool init(int max_points_per_cloud, int max_voxel_grid_dim);
    void destroy();
    bool isInitialized() const { return initialized_; }
    /* ---- Device memory, indexed by lidar channel (0 = front, 1 = rear) ---- */
    float* d_points[2];       // input cloud (N*3 floats)
    uint8_t* d_pass_mask[2];  // spatial-filter pass mask
    uint8_t* d_sor_mask[2];   // SOR mask
    int* d_voxel_counts[2];   // per-voxel hit count
    // Voxel centroid sums use three separate int arrays holding fixed-point
    // values (coordinate * VOXEL_SCALE, i.e. * 1000) accumulated with
    // atomicAdd(int*), because atomicAdd has no float3 overload.
    int* d_voxel_sum_x[2];    // sum of point x coordinates in the voxel (*VOXEL_SCALE)
    int* d_voxel_sum_y[2];
    int* d_voxel_sum_z[2];
    float* d_mean_dist[2];    // per-point mean k-neighbour distance (SOR scratch)
    /* ---- Pinned (page-locked) host memory for fast async H2D/D2H ---- */
    float* h_points[2];       // input cloud (pinned)
    uint8_t* h_pass_mask[2];  // filter result (pinned)
    int* h_voxel_counts[2];   // voxel hit counts (pinned)
    int* h_voxel_sum_x[2];
    int* h_voxel_sum_y[2];
    int* h_voxel_sum_z[2];
    /* ---- CUDA streams, one per channel ---- */
    cudaStream_t stream[2];
    /* ---- Capacities fixed at init() time ---- */
    int max_points;
    int max_voxel_dim;
    int max_voxels;  // = max_voxel_dim^3
    static constexpr int VOXEL_SCALE = 1000;  // fixed-point precision: 1 mm
private:
    GpuResources() : initialized_(false) {}
    bool initialized_;
};
/* ===================== 设备可用性检查 ===================== */
bool isDeviceAvailable(); bool isDeviceAvailable();
// GPU加速的条件过滤 /* ===================== 核心 GPU 处理接口 ===================== */
/**
* @brief 线使 stream
*
*
* 1. Bounding Box + /
* 2. Voxel Grid
* 3. SOR
*
* @param points float x0,y0,z0, x1,y1,z1, ...
* @param num_points
* @param params
* @param channel 0=, 1=使 Stream
* @return float
*/
std::vector<float> gpuProcessPointCloud(const float* points, int num_points, const GpuProcessingParams& params,
int channel = 0);
/**
* @brief
*/
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z, std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
float max_x, float max_y, float max_z); float max_x, float max_y, float max_z);
// GPU加速的体素网格降采样 /**
* @brief
*/
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range); std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
// GPU加速的统计离群点去除 /**
* @brief O(N²)
*/
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh); std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
// GPU加速的合并处理空间过滤+体素化+离群点去除) /**
* @brief + + SOR
* 使
*/
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z, std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range, float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
int mean_k, float std_mult); int mean_k, float std_mult);
// 辅助函数将PCL点云转换为简单浮点数组 /* ===================== 辅助函数 ===================== */
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud); std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
// 辅助函数将浮点数组转换为PCL点云
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points); typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
#endif // GPU_PROCESSING_H #endif // GPU_PROCESSING_H

View File

@ -79,7 +79,10 @@ def generate_launch_description():
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米) "grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
# 单元格尺寸:由 grid_range / grid_size 决定 # 单元格尺寸:由 grid_range / grid_size 决定
"enable_print": False, # 是否打印栅格 "enable_print": False, # 是否打印栅格
"use_gpu": False, # 是否使用GPU加速 "use_gpu": True, # 是否使用GPU加速
"sor_method": 1, # 推荐先用1效果不满意再试2
"radius_search": 0.5, # 搜索半径(米),越大越严格
"min_neighbors": 3, # 至少需要N个邻居才保留
# 车身过滤参数 # 车身过滤参数
"filter_car": True, # 是否启用车身过滤 "filter_car": True, # 是否启用车身过滤
"car_length": 1.8, # 车长(米) "car_length": 1.8, # 车长(米)

View File

@ -1,310 +1,713 @@
/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */ /* GPU加速的点云处理 - 完整优化版
#include <cuda_runtime.h> */
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <cstring>
#include <vector> #include <vector>
#include "gpu_processing.h" #include "gpu_processing.h"
#ifdef __CUDACC__ #ifdef USE_GPU
#define DEVICE_FUNC __device__ __host__ #include <cuda_runtime.h>
#else
#define DEVICE_FUNC inline
#endif #endif
__constant__ float d_const_params[9]; /* ================================================================
__constant__ int d_const_ints[2]; * 宏与工具
* ================================================================ */
/* Report-and-continue CUDA error check: logs file/line and the error string
 * on failure but does not abort (best-effort behaviour — callers fall back
 * to returning empty results when a step fails). */
#define CUDA_CHECK(call)                                                                             \
    do                                                                                               \
    {                                                                                                \
        cudaError_t err = (call);                                                                    \
        if (err != cudaSuccess)                                                                      \
        {                                                                                            \
            fprintf(stderr, "[CUDA ERROR] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        }                                                                                            \
    } while (0)

/* Threads per block (256 chosen as a good fit for Jetson Orin). */
static constexpr int BLOCK_SIZE = 256;

/* Fixed-point scale for voxel coordinate accumulation (millimetre precision,
 * works around atomicAdd having no float3 overload). */
static constexpr int VOXEL_SCALE = GpuResources::VOXEL_SCALE;
/* ================================================================
* GpuResources 单例实现
* ================================================================ */
GpuResources& GpuResources::instance()
{ {
#if defined(USE_GPU) && defined(__CUDACC__) static GpuResources res;
int count; return res;
cudaGetDeviceCount(&count); }
return count > 0;
/**
 * @brief Allocates all persistent device buffers, pinned host buffers and
 *        per-channel CUDA streams. Idempotent: returns true immediately if
 *        already initialized.
 * @param max_points_per_cloud  capacity (points) of each per-channel buffer
 * @param max_voxel_grid_dim    per-axis voxel count (total voxels = dim^3)
 * @return true on success; false when built without USE_GPU or capacities
 *         are invalid. Allocation errors are logged by CUDA_CHECK.
 */
bool GpuResources::init(int max_points_per_cloud, int max_voxel_grid_dim)
{
#ifdef USE_GPU
    if (initialized_) return true;
    max_points = max_points_per_cloud;
    max_voxel_dim = max_voxel_grid_dim;
    /* max_voxel_dim^3 overflows a 32-bit int for dim >= 1291; validate the
     * product in 64-bit before committing to the capacity. */
    long long voxels64 = (long long)max_voxel_dim * max_voxel_dim * max_voxel_dim;
    if (max_points <= 0 || max_voxel_dim <= 0 || voxels64 > (long long)INT32_MAX)
    {
        fprintf(stderr, "[GpuResources] invalid capacities: max_points=%d, max_voxel_dim=%d\n", max_points,
                max_voxel_dim);
        return false;
    }
    max_voxels = (int)voxels64;
    for (int ch = 0; ch < 2; ++ch)
    {
        /* ---- One non-blocking stream per lidar channel ---- */
        CUDA_CHECK(cudaStreamCreateWithFlags(&stream[ch], cudaStreamNonBlocking));
        /* ---- Device buffers ---- */
        CUDA_CHECK(cudaMalloc(&d_points[ch], (size_t)max_points * 3 * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_pass_mask[ch], (size_t)max_points * sizeof(uint8_t)));
        CUDA_CHECK(cudaMalloc(&d_sor_mask[ch], (size_t)max_points * sizeof(uint8_t)));
        CUDA_CHECK(cudaMalloc(&d_voxel_counts[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMalloc(&d_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMalloc(&d_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMalloc(&d_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMalloc(&d_mean_dist[ch], (size_t)max_points * sizeof(float)));
        /* ---- Pinned host buffers (enable true async H2D/D2H) ---- */
        CUDA_CHECK(cudaMallocHost(&h_points[ch], (size_t)max_points * 3 * sizeof(float)));
        CUDA_CHECK(cudaMallocHost(&h_pass_mask[ch], (size_t)max_points * sizeof(uint8_t)));
        CUDA_CHECK(cudaMallocHost(&h_voxel_counts[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMallocHost(&h_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMallocHost(&h_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int)));
        CUDA_CHECK(cudaMallocHost(&h_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int)));
    }
    initialized_ = true;
    fprintf(stdout,
            "[GpuResources] Initialized: max_points=%d, max_voxel_dim=%d, "
            "max_voxels=%d\n",
            max_points, max_voxel_dim, max_voxels);
    return true;
#else
    (void)max_points_per_cloud;
    (void)max_voxel_grid_dim;
    return false;
#endif
}
/* Releases every resource acquired by init(). Safe to call when not
 * initialized (no-op), and leaves the singleton ready for a fresh init(). */
void GpuResources::destroy()
{
#ifdef USE_GPU
    if (!initialized_) return;
    for (int lidar = 0; lidar < 2; ++lidar)
    {
        /* Drain any in-flight work before tearing the stream down. */
        cudaStreamSynchronize(stream[lidar]);
        cudaStreamDestroy(stream[lidar]);
        /* Device-side buffers. */
        cudaFree(d_mean_dist[lidar]);
        cudaFree(d_voxel_sum_z[lidar]);
        cudaFree(d_voxel_sum_y[lidar]);
        cudaFree(d_voxel_sum_x[lidar]);
        cudaFree(d_voxel_counts[lidar]);
        cudaFree(d_sor_mask[lidar]);
        cudaFree(d_pass_mask[lidar]);
        cudaFree(d_points[lidar]);
        /* Pinned host-side buffers. */
        cudaFreeHost(h_voxel_sum_z[lidar]);
        cudaFreeHost(h_voxel_sum_y[lidar]);
        cudaFreeHost(h_voxel_sum_x[lidar]);
        cudaFreeHost(h_voxel_counts[lidar]);
        cudaFreeHost(h_pass_mask[lidar]);
        cudaFreeHost(h_points[lidar]);
    }
    initialized_ = false;
#endif
}
/* ================================================================
* 设备可用性
* ================================================================ */
/* Returns true when at least one CUDA device is present and the query
 * itself succeeded; always false in non-GPU builds. */
bool isDeviceAvailable()
{
#ifdef USE_GPU
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    return (err == cudaSuccess && count > 0);
#else
    return false;
#endif
}
// ================================================================= /* ================================================================
// 🔥 重型 GPU 内核:计算密集型,强制 GPU 满载 * CUDA 内核函数
// ================================================================= * ================================================================ */
__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
/* ----------------------------------------------------------------
* 内核 1空间过滤 + 车身/扫刷剔除
*
* 每个线程处理一个点,输出 pass_mask[i]=1 表示该点通过过滤。
* ---------------------------------------------------------------- */
__global__ void kernelSpatialFilter(const float* __restrict__ points, int num_points, float min_x, float max_x,
float min_y, float max_y, float min_z, float max_z,
/* 车身包围盒(世界坐标,负向剔除) */
float car_min_x, float car_max_x, float car_min_y, float car_max_y, float car_min_z,
float car_max_z, bool filter_car,
/* 右侧扫刷 */
float brush_r_min_x, float brush_r_max_x, float brush_r_min_y, float brush_r_max_y,
float brush_r_min_z, float brush_r_max_z,
/* 左侧扫刷 */
float brush_l_min_x, float brush_l_max_x, float brush_l_min_y, float brush_l_max_y,
float brush_l_min_z, float brush_l_max_z, bool filter_brush,
uint8_t* __restrict__ pass_mask)
{ {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_points) return; if (idx >= num_points) return;
const float min_x = d_const_params[0]; float px = points[idx * 3 + 0];
const float min_y = d_const_params[1]; float py = points[idx * 3 + 1];
const float min_z = d_const_params[2]; float pz = points[idx * 3 + 2];
const float max_x = d_const_params[3];
const float max_y = d_const_params[4];
const float max_z = d_const_params[5];
const float voxel_res = d_const_params[6];
const float grid_range = d_const_params[7];
const float std_mult = d_const_params[8];
const int mean_k = d_const_ints[0];
const int grid_size = d_const_ints[1];
const float x = points[idx * 3 + 0]; /* 1. Bounding box 过滤 */
const float y = points[idx * 3 + 1]; bool valid = (px > min_x && px < max_x && py > min_y && py < max_y && pz > min_z && pz < max_z);
const float z = points[idx * 3 + 2];
// 1. 空间过滤 /* 2. 车身剔除(在 bbox 内的点中剔除车身范围) */
bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z); if (valid && filter_car)
if (!valid)
{ {
output_mask[idx] = false; bool in_car = (px >= car_min_x && px <= car_max_x && py >= car_min_y && py <= car_max_y && pz >= car_min_z &&
return; pz <= car_max_z);
if (in_car) valid = false;
} }
// 2. 体素过滤 /* 3. 扫刷剔除 */
const float half = grid_range * 0.5f; if (valid && filter_brush)
int vx = __float2int_rn((half - x) / voxel_res);
int vy = __float2int_rn((y + half) / voxel_res);
int vz = __float2int_rn((z + half) / voxel_res);
valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
if (!valid)
{ {
output_mask[idx] = false; bool in_right = (px >= brush_r_min_x && px <= brush_r_max_x && py >= brush_r_min_y && py <= brush_r_max_y &&
return; pz >= brush_r_min_z && pz <= brush_r_max_z);
bool in_left = (px >= brush_l_min_x && px <= brush_l_max_x && py >= brush_l_min_y && py <= brush_l_max_y &&
pz >= brush_l_min_z && pz <= brush_l_max_z);
if (in_right || in_left) valid = false;
} }
// ################################################################# pass_mask[idx] = valid ? 1 : 0;
// 🔥🔥🔥 这里是强制高计算量,让 GPU 跑满的核心 }
// #################################################################
__shared__ float s_points[1024 * 3];
int lid = threadIdx.x;
s_points[lid * 3 + 0] = x; /* ----------------------------------------------------------------
s_points[lid * 3 + 1] = y; * 内核 2体素化原子累加 + 定点整数,修复竞争条件)
s_points[lid * 3 + 2] = z; *
__syncthreads(); * 修复说明:
* 原版用 float3 做均值累加,但 atomicAdd 不支持 float3
* 多线程同时读写导致数据竞争,结果错误。
*
* 新版:将坐标乘以 VOXEL_SCALE(=1000) 转为 int
* 用 atomicAdd(int*) 累加最后除以count × VOXEL_SCALE还原均值。
* atomicAdd(int*) 在所有 CUDA 设备上均支持。
*
* pass_mask 作为输入门控:仅处理 pass_mask[idx]==1 的点。
* ---------------------------------------------------------------- */
/* Kernel 2: voxelization. One thread per input point; points gated by
 * pass_mask accumulate a hit count and fixed-point coordinate sums per voxel
 * via atomicAdd(int*), which is supported on all CUDA devices (unlike a
 * float3 accumulation, which would race). */
__global__ void kernelVoxelize(const float* __restrict__ points, const uint8_t* __restrict__ pass_mask, int num_points,
                               float resolution, float range, /* origin-centered covered range (symmetric) */
                               int grid_dim, /* voxels per axis = range / resolution */
                               int* __restrict__ voxel_counts, int* __restrict__ voxel_sum_x,
                               int* __restrict__ voxel_sum_y, int* __restrict__ voxel_sum_z)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_points) return;
    if (!pass_mask[idx]) return; /* already rejected by the spatial filter */
    float px = points[idx * 3 + 0];
    float py = points[idx * 3 + 1];
    float pz = points[idx * 3 + 2];
    /* Shift by half the range so the origin-centered cloud maps to [0, grid_dim). */
    float half = range * 0.5f;
    int vx = __float2int_rd((px + half) / resolution);
    int vy = __float2int_rd((py + half) / resolution);
    int vz = __float2int_rd((pz + half) / resolution);
    /* Bounds check */
    if (vx < 0 || vx >= grid_dim || vy < 0 || vy >= grid_dim || vz < 0 || vz >= grid_dim) return;
    int linear = vz * grid_dim * grid_dim + vy * grid_dim + vx;
    atomicAdd(&voxel_counts[linear], 1);
    /* Fixed-point integer accumulation — race-free on all devices */
    atomicAdd(&voxel_sum_x[linear], __float2int_rd(px * VOXEL_SCALE));
    atomicAdd(&voxel_sum_y[linear], __float2int_rd(py * VOXEL_SCALE));
    atomicAdd(&voxel_sum_z[linear], __float2int_rd(pz * VOXEL_SCALE));
}
/* ----------------------------------------------------------------
* 内核 3将体素结果打包为连续点云数组
*
* 输出voxel_points每个非空体素的均值中心点out_count 为输出点数。
* 注意out_count 用 atomicAdd 写入,初始化为 0。
* ---------------------------------------------------------------- */
/* Kernel 3: compact non-empty voxels into a dense point array. Each thread
 * inspects one voxel; non-empty voxels emit their centroid (fixed-point sums
 * divided by count and VOXEL_SCALE) at a slot claimed via atomicAdd on
 * out_count. out_count must be zeroed before launch; output order is
 * therefore non-deterministic across runs. */
__global__ void kernelCollectVoxels(const int* __restrict__ voxel_counts, const int* __restrict__ voxel_sum_x,
                                    const int* __restrict__ voxel_sum_y, const int* __restrict__ voxel_sum_z,
                                    int num_voxels, float voxel_scale_inv, /* = 1.0f / VOXEL_SCALE */
                                    float* __restrict__ voxel_points, int* __restrict__ out_count)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_voxels) return;
    int cnt = voxel_counts[idx];
    if (cnt == 0) return;
    /* Centroid = fixed-point sum / (count * VOXEL_SCALE) */
    float inv_cnt = 1.0f / (float)cnt;
    float cx = (float)voxel_sum_x[idx] * voxel_scale_inv * inv_cnt;
    float cy = (float)voxel_sum_y[idx] * voxel_scale_inv * inv_cnt;
    float cz = (float)voxel_sum_z[idx] * voxel_scale_inv * inv_cnt;
    int pos = atomicAdd(out_count, 1);
    voxel_points[pos * 3 + 0] = cx;
    voxel_points[pos * 3 + 1] = cy;
    voxel_points[pos * 3 + 2] = cz;
}
/* ----------------------------------------------------------------
* 内核 4统计离群点去除SOR
*
* 修复说明(原版两个 bug
* Bug 1mean_k 参数同时被当作「邻居数 k」和「距离阈值」。
* 修复:分离为 k整数邻居数和 global_mean/global_std统计量
* Bug 2遍历所有点做 O(N²) 暴力搜索N 大时 GPU 也会超时。
* 修复SOR 在体素化后的点云上执行,点数已大幅缩减(通常 <5000
*
* 算法:
* 1. 每个线程负责一个点 p_i
* 2. 遍历所有其他点,找到距离最近的 k 个点,计算平均距离 d_i
* 3. 设全局 mean_d = Σd_i/N, std_d = sqrt(Σ(d_i-mean_d)²/N)
* mean_d 和 std_d 在 CPU 上计算后作为参数传入)
* 4. 若 d_i > mean_d + std_mult * std_d则剔除
*
* 注意:本内核仅执行步骤 2输出每点的 mean_k_dist。
* 步骤 3/4 在主机侧 CPU 计算(数据量小,几微秒)。
* ---------------------------------------------------------------- */
/* Kernel 4 (SOR step 2 of 4): for each point, brute-force scan all other
 * points, keep the k smallest distances in a sorted register array, and
 * write the mean of those k distances to mean_k_dist[idx]. Points with no
 * neighbour found get the sentinel 1e30f. The global mean/stddev thresholding
 * (steps 3-4) runs on the host in cpuSORFilter, where the data is small.
 * Intended to run on the voxel-downsampled cloud, so N is small enough for
 * the O(N^2) scan. */
__global__ void kernelComputeMeanKDist(const float* __restrict__ points, int num_points, int k,
                                       float* __restrict__ mean_k_dist) /* out: mean distance to k nearest */
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_points) return;
    float px = points[idx * 3 + 0];
    float py = points[idx * 3 + 1];
    float pz = points[idx * 3 + 2];
    /* Small per-thread array of the k smallest distances, kept ascending.
     * k is clamped to [1, MAX_K]: the lower clamp prevents an out-of-bounds
     * access at top_dists[actual_k - 1] when a caller passes k <= 0. */
    const int MAX_K = 64;
    int actual_k = (k > MAX_K) ? MAX_K : k;
    if (actual_k < 1) actual_k = 1;
    float top_dists[MAX_K];
    for (int i = 0; i < actual_k; ++i) top_dists[i] = 1e30f;
    for (int j = 0; j < num_points; ++j)
    {
        if (j == idx) continue;
        float dx = points[j * 3 + 0] - px;
        float dy = points[j * 3 + 1] - py;
        float dz = points[j * 3 + 2] - pz;
        float dist = sqrtf(dx * dx + dy * dy + dz * dz);
        /* Insert into the ascending top-k array when it beats the current max */
        if (dist < top_dists[actual_k - 1])
        {
            top_dists[actual_k - 1] = dist;
            /* Insertion sort: bubble the new value down to its slot */
            for (int m = actual_k - 2; m >= 0; --m)
            {
                if (top_dists[m] > top_dists[m + 1])
                {
                    float tmp = top_dists[m];
                    top_dists[m] = top_dists[m + 1];
                    top_dists[m + 1] = tmp;
                }
                else
                    break;
            }
        }
    }
    /* Mean over the neighbours actually found (slots still at the 1e30f
     * sentinel are unfilled and excluded) */
    float sum = 0.0f;
    int cnt = 0;
    for (int i = 0; i < actual_k; ++i)
    {
        if (top_dists[i] < 1e29f)
        {
            sum += top_dists[i];
            ++cnt;
        }
    }
    mean_k_dist[idx] = (cnt > 0) ? (sum / cnt) : 1e30f;
}
/* ================================================================
 * Host-side SOR thresholding (steps 3-4 of statistical outlier removal).
 *
 * Computes the mean and standard deviation of the per-point mean-k-distances
 * (excluding the 1e30f "no neighbour" sentinel) and returns the indices of
 * points whose distance is <= mean + std_mult * stddev. Runs on the CPU
 * because the input is the already-downsampled cloud (small N).
 *
 * @param mean_k_dist per-point mean distance to k nearest neighbours
 * @param num_points  number of entries in mean_k_dist
 * @param std_mult    stddev multiplier for the cutoff threshold
 * @return indices of inlier points; empty if no entry is valid
 * ================================================================ */
static std::vector<int> cpuSORFilter(const float* mean_k_dist, int num_points, float std_mult)
{
    /* Mean over valid entries only */
    double sum = 0.0;
    int valid = 0;
    for (int i = 0; i < num_points; ++i)
    {
        if (mean_k_dist[i] < 1e29f)
        {
            sum += mean_k_dist[i];
            ++valid;
        }
    }
    if (valid == 0) return {};
    double mean = sum / valid;
    /* Population standard deviation over the same entries */
    double sq_sum = 0.0;
    for (int i = 0; i < num_points; ++i)
    {
        if (mean_k_dist[i] < 1e29f)
        {
            double diff = mean_k_dist[i] - mean;
            sq_sum += diff * diff;
        }
    }
    double std_dev = std::sqrt(sq_sum / valid);
    double threshold = mean + std_mult * std_dev;
    /* Sentinel entries (1e30f) always exceed the threshold and drop out */
    std::vector<int> indices;
    indices.reserve(valid);
    for (int i = 0; i < num_points; ++i)
    {
        if (mean_k_dist[i] <= threshold) indices.push_back(i);
    }
    return indices;
}
/* ================================================================
 * Core GPU pipeline: spatial filter -> voxel downsample -> SOR.
 *
 * @param points_in  flat xyz array (x0,y0,z0, x1,y1,z1, ...)
 * @param num_points number of points (must be <= GpuResources::max_points)
 * @param params     pipeline parameters
 * @param channel    0 = front lidar, 1 = rear; each uses its own stream
 * @return flat xyz array of the surviving (downsampled, de-noised) points;
 *         empty on any failure or in non-GPU builds
 * ================================================================ */
std::vector<float> gpuProcessPointCloud(const float* points_in, int num_points, const GpuProcessingParams& params,
                                        int channel)
{
#ifdef USE_GPU
    auto& res = GpuResources::instance();
    if (!res.isInitialized() || !isDeviceAvailable()) return {};
    if (num_points <= 0 || num_points > res.max_points) return {};
    channel = (channel == 0) ? 0 : 1;
    cudaStream_t st = res.stream[channel];
    /* ---- Step 0: stage input in pinned memory, then async H2D ---- */
    std::memcpy(res.h_points[channel], points_in, (size_t)num_points * 3 * sizeof(float));
    CUDA_CHECK(cudaMemcpyAsync(res.d_points[channel], res.h_points[channel], (size_t)num_points * 3 * sizeof(float),
                               cudaMemcpyHostToDevice, st));
    /* ---- Step 1: spatial filter (bbox + car/brush exclusion) ---- */
    int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
    kernelSpatialFilter<<<blocks_pts, BLOCK_SIZE, 0, st>>>(
        res.d_points[channel], num_points, params.min_x, params.max_x, params.min_y, params.max_y, params.min_z,
        params.max_z, params.car_min_x, params.car_max_x, params.car_min_y, params.car_max_y, params.car_min_z,
        params.car_max_z, params.filter_car, params.brush_r_min_x, params.brush_r_max_x, params.brush_r_min_y,
        params.brush_r_max_y, params.brush_r_min_z, params.brush_r_max_z, params.brush_l_min_x, params.brush_l_max_x,
        params.brush_l_min_y, params.brush_l_max_y, params.brush_l_min_z, params.brush_l_max_z, params.filter_brush,
        res.d_pass_mask[channel]);
    /* ---- Step 2: voxel-grid downsampling ---- */
    int grid_dim = static_cast<int>(params.grid_range / params.voxel_resolution);
    if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim;
    int num_voxels = grid_dim * grid_dim * grid_dim;
    /* Capacity guard: voxel centroids are staged in d_mean_dist, which holds
     * max_points floats. The collect kernel can emit up to
     * min(num_points, num_voxels) centroids * 3 floats — bail out instead of
     * overflowing the scratch buffer. */
    long long worst_out = (num_points < num_voxels) ? num_points : num_voxels;
    if (worst_out * 3 > (long long)res.max_points)
    {
        fprintf(stderr, "[gpuProcessPointCloud] voxel scratch capacity exceeded\n");
        return {};
    }
    /* Clear the voxel accumulators (async on the channel stream) */
    CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[channel], 0, (size_t)num_voxels * sizeof(int), st));
    CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[channel], 0, (size_t)num_voxels * sizeof(int), st));
    CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[channel], 0, (size_t)num_voxels * sizeof(int), st));
    CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[channel], 0, (size_t)num_voxels * sizeof(int), st));
    kernelVoxelize<<<blocks_pts, BLOCK_SIZE, 0, st>>>(res.d_points[channel], res.d_pass_mask[channel], num_points,
                                                      params.voxel_resolution, params.grid_range, grid_dim,
                                                      res.d_voxel_counts[channel], res.d_voxel_sum_x[channel],
                                                      res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel]);
    /* Compact non-empty voxels into a dense point array.
     * The first sizeof(int) bytes of d_sor_mask are reused as out_count
     * (d_sor_mask is otherwise unused in this pipeline). */
    int* d_voxel_out_count = reinterpret_cast<int*>(res.d_sor_mask[channel]);
    CUDA_CHECK(cudaMemsetAsync(d_voxel_out_count, 0, sizeof(int), st));
    /* Voxelized cloud is written into the d_mean_dist float scratch buffer */
    float* d_voxel_pts = res.d_mean_dist[channel];
    int blocks_vox = (num_voxels + BLOCK_SIZE - 1) / BLOCK_SIZE;
    kernelCollectVoxels<<<blocks_vox, BLOCK_SIZE, 0, st>>>(
        res.d_voxel_counts[channel], res.d_voxel_sum_x[channel], res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel],
        num_voxels, 1.0f / (float)VOXEL_SCALE, d_voxel_pts, d_voxel_out_count);
    /* Fetch the number of voxelized points; sync before using it on the host */
    int voxel_point_count = 0;
    CUDA_CHECK(cudaMemcpyAsync(&voxel_point_count, d_voxel_out_count, sizeof(int), cudaMemcpyDeviceToHost, st));
    CUDA_CHECK(cudaStreamSynchronize(st));
    if (voxel_point_count <= 0) return {};
    /* ---- Step 3: SOR on the (now much smaller) voxelized cloud ----
     * 3a. GPU computes each point's mean-k-distance
     * 3b. results come back through pinned memory
     * 3c. CPU computes mean/stddev and selects inliers */
    int blocks_sor = (voxel_point_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
    /* Reuse d_voxel_sum_x for the distances (int and float are equally wide) */
    float* d_mkdist = reinterpret_cast<float*>(res.d_voxel_sum_x[channel]);
    kernelComputeMeanKDist<<<blocks_sor, BLOCK_SIZE, 0, st>>>(d_voxel_pts, voxel_point_count, params.sor_mean_k,
                                                              d_mkdist);
    /* Pinned buffer receives the mean-k-distance results */
    float* h_mkdist = reinterpret_cast<float*>(res.h_voxel_sum_x[channel]);
    CUDA_CHECK(
        cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)voxel_point_count * sizeof(float), cudaMemcpyDeviceToHost, st));
    /* Also bring back the voxelized cloud for the host-side selection
     * (reuses the h_points pinned buffer; voxel count << max_points) */
    float* h_voxel_pts = res.h_points[channel];
    CUDA_CHECK(cudaMemcpyAsync(h_voxel_pts, d_voxel_pts, (size_t)voxel_point_count * 3 * sizeof(float),
                               cudaMemcpyDeviceToHost, st));
    CUDA_CHECK(cudaStreamSynchronize(st));
    /* 3c. Host-side SOR selection (tiny input, negligible cost) */
    auto sor_indices = cpuSORFilter(h_mkdist, voxel_point_count, params.sor_std_mult);
    if (sor_indices.empty()) return {};
    /* ---- Step 4: assemble the output cloud ---- */
    std::vector<float> result;
    result.reserve(sor_indices.size() * 3);
    for (int idx : sor_indices)
    {
        result.push_back(h_voxel_pts[idx * 3 + 0]);
        result.push_back(h_voxel_pts[idx * 3 + 1]);
        result.push_back(h_voxel_pts[idx * 3 + 2]);
    }
    return result;
#else
    (void)points_in;
    (void)num_points;
    (void)params;
    (void)channel;
    return {};
#endif
}
/* ================================================================
 * Legacy-compatible wrapper: gpuMergedProcessing
 *
 * Delegates to the full gpuProcessPointCloud pipeline on channel 0.
 *
 * NOTE(review): for interface compatibility the returned indices are simply
 * 0..N-1 over the OUTPUT cloud — they are NOT indices into the original
 * input cloud (the pipeline downsamples, so a 1:1 mapping no longer exists).
 * Callers that dereference these indices into the input cloud should migrate
 * to gpuProcessPointCloud, which returns the points themselves.
 * ================================================================ */
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
                                     int mean_k, float std_mult)
{
#ifdef USE_GPU
    int num_points = (int)(points.size() / 3);
    if (num_points == 0) return {};
    /* Map the flat argument list onto the pipeline parameter struct;
     * car/brush exclusion is not part of this legacy interface. */
    GpuProcessingParams params{};
    params.min_x = min_x;
    params.max_x = max_x;
    params.min_y = min_y;
    params.max_y = max_y;
    params.min_z = min_z;
    params.max_z = max_z;
    params.voxel_resolution = voxel_resolution;
    params.grid_range = grid_range;
    params.sor_mean_k = mean_k;
    params.sor_std_mult = std_mult;
    params.filter_car = false;
    params.filter_brush = false;
    auto result_pts = gpuProcessPointCloud(points.data(), num_points, params, 0);
    /* Sequentially numbered indices over the output cloud (see NOTE above) */
    int out_count = (int)(result_pts.size() / 3);
    std::vector<int> indices(out_count);
    for (int i = 0; i < out_count; ++i) indices[i] = i;
    return indices;
#else
    (void)points;
    (void)min_x;
    (void)min_y;
    (void)min_z;
    (void)max_x;
    (void)max_y;
    (void)max_z;
    (void)voxel_resolution;
    (void)grid_range;
    (void)mean_k;
    (void)std_mult;
    return {};
#endif
}
/* ================================================================
 * Lightweight filter interface: bounding-box pass only (no voxelization,
 * no SOR, no car/brush exclusion). Runs on channel 0's stream.
 *
 * @return indices (into the input cloud) of points strictly inside the box;
 *         empty when resources are uninitialized, input is empty/oversized,
 *         or in non-GPU builds.
 * ================================================================ */
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
                                     float max_x, float max_y, float max_z)
{
#ifdef USE_GPU
    auto& res = GpuResources::instance();
    if (!res.isInitialized() || !isDeviceAvailable()) return {};
    int num_points = (int)(points.size() / 3);
    if (num_points == 0 || num_points > res.max_points) return {};
    cudaStream_t st = res.stream[0];
    /* Stage in pinned memory, async H2D on channel 0 */
    std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
    CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
                               cudaMemcpyHostToDevice, st));
    int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
    /* Car/brush boxes zeroed and disabled — pure bbox filtering */
    kernelSpatialFilter<<<blocks, BLOCK_SIZE, 0, st>>>(res.d_points[0], num_points, min_x, max_x, min_y, max_y, min_z,
                                                       max_z, 0, 0, 0, 0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                       0, false, res.d_pass_mask[0]);
    CUDA_CHECK(cudaMemcpyAsync(res.h_pass_mask[0], res.d_pass_mask[0], (size_t)num_points * sizeof(uint8_t),
                               cudaMemcpyDeviceToHost, st));
    CUDA_CHECK(cudaStreamSynchronize(st));
    /* Collect surviving indices on the host */
    std::vector<int> results;
    results.reserve(num_points / 2);
    for (int i = 0; i < num_points; ++i)
    {
        if (res.h_pass_mask[0][i]) results.push_back(i);
    }
    return results;
#else
    (void)points;
    (void)min_x;
    (void)min_y;
    (void)min_z;
    (void)max_x;
    (void)max_y;
    (void)max_z;
    return {};
#endif
}
// ================================================================= /* ================================================================
// 离群点去除 * 体素化接口(独立)
// ================================================================= * ================================================================ */
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
{ {
return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t); #ifdef USE_GPU
auto& res = GpuResources::instance();
if (!res.isInitialized() || !isDeviceAvailable()) return {};
int num_points = (int)(points.size() / 3);
if (num_points == 0 || num_points > res.max_points) return {};
int grid_dim = static_cast<int>(range / resolution);
if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim;
int num_voxels = grid_dim * grid_dim * grid_dim;
if (num_voxels > res.max_voxels) return {};
cudaStream_t st = res.stream[0];
/* 全部点视为通过空间过滤pass_mask 全置 1 */
CUDA_CHECK(cudaMemsetAsync(res.d_pass_mask[0], 1, (size_t)num_points * sizeof(uint8_t), st));
std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
cudaMemcpyHostToDevice, st));
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[0], 0, (size_t)num_voxels * sizeof(int), st));
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[0], 0, (size_t)num_voxels * sizeof(int), st));
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[0], 0, (size_t)num_voxels * sizeof(int), st));
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[0], 0, (size_t)num_voxels * sizeof(int), st));
int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
kernelVoxelize<<<blocks_pts, BLOCK_SIZE, 0, st>>>(res.d_points[0], res.d_pass_mask[0], num_points, resolution,
range, grid_dim, res.d_voxel_counts[0], res.d_voxel_sum_x[0],
res.d_voxel_sum_y[0], res.d_voxel_sum_z[0]);
/* 回传体素结果 */
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_counts[0], res.d_voxel_counts[0], (size_t)num_voxels * sizeof(int),
cudaMemcpyDeviceToHost, st));
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_x[0], res.d_voxel_sum_x[0], (size_t)num_voxels * sizeof(int),
cudaMemcpyDeviceToHost, st));
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_y[0], res.d_voxel_sum_y[0], (size_t)num_voxels * sizeof(int),
cudaMemcpyDeviceToHost, st));
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_z[0], res.d_voxel_sum_z[0], (size_t)num_voxels * sizeof(int),
cudaMemcpyDeviceToHost, st));
CUDA_CHECK(cudaStreamSynchronize(st));
float scale_inv = 1.0f / (float)VOXEL_SCALE;
std::vector<PointFloat> result;
result.reserve(num_voxels / 10);
for (int i = 0; i < num_voxels; ++i)
{
int cnt = res.h_voxel_counts[0][i];
if (cnt == 0) continue;
float inv_cnt = 1.0f / (float)cnt;
result.emplace_back((float)res.h_voxel_sum_x[0][i] * scale_inv * inv_cnt,
(float)res.h_voxel_sum_y[0][i] * scale_inv * inv_cnt,
(float)res.h_voxel_sum_z[0][i] * scale_inv * inv_cnt);
}
return result;
#else
return {};
#endif
} }
/* ================================================================
 * Statistical outlier removal (SOR) interface (standalone).
 *
 * Computes each point's mean distance to its `mean_k` nearest
 * neighbours on the GPU, then applies the mean + std_thresh * stddev
 * cut on the CPU (cpuSORFilter).
 *
 * @param points      flat xyz array [x0,y0,z0, x1,y1,z1, ...]
 * @param mean_k      neighbourhood size k
 * @param std_thresh  standard-deviation multiplier for the cutoff
 * @return indices of the points kept (inliers); empty on error / no GPU
 * ================================================================ */
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
{
#ifdef USE_GPU
    auto& res = GpuResources::instance();
    if (!res.isInitialized() || !isDeviceAvailable()) return {};

    const int num_points = (int)(points.size() / 3);
    if (num_points == 0 || num_points > res.max_points) return {};

    cudaStream_t st = res.stream[0];

    // Pinned-memory staging + async upload of the xyz array.
    std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
    CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
                               cudaMemcpyHostToDevice, st));

    float* d_mkdist = res.d_mean_dist[0];
    const int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
    kernelComputeMeanKDist<<<blocks, BLOCK_SIZE, 0, st>>>(res.d_points[0], num_points, mean_k, d_mkdist);

    // Reuse the (idle here) pinned voxel-sum buffer as the download target
    // for the per-point mean distances.
    // NOTE(review): assumes h_voxel_sum_x[0] holds at least num_points
    // floats — verify against the GpuResources allocation sizes.
    float* h_mkdist = reinterpret_cast<float*>(res.h_voxel_sum_x[0]);
    CUDA_CHECK(cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)num_points * sizeof(float), cudaMemcpyDeviceToHost, st));
    CUDA_CHECK(cudaStreamSynchronize(st));

    // CPU side: mean/stddev thresholding over the distances.
    return cpuSORFilter(h_mkdist, num_points, std_thresh);
#else
    (void)points; (void)mean_k; (void)std_thresh;
    return {};
#endif
}
/* ================================================================
* PCL ↔ float 数组转换辅助函数
* ================================================================ */
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud) std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
{ {
std::vector<float> r; std::vector<float> result;
if (!cloud) return r; if (!cloud || cloud->empty()) return result;
r.reserve(cloud->size() * 3);
for (auto& p : cloud->points) result.resize(cloud->size() * 3);
for (size_t i = 0; i < cloud->size(); ++i)
{ {
r.push_back(p.x); result[i * 3 + 0] = cloud->points[i].x;
r.push_back(p.y); result[i * 3 + 1] = cloud->points[i].y;
r.push_back(p.z); result[i * 3 + 2] = cloud->points[i].z;
} }
return r; return result;
} }
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points) typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
{ {
auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>); auto cloud = std::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
size_t n = points.size() / 3; size_t n = points.size() / 3;
cloud->resize(n); cloud->resize(n);
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; ++i)
{ {
cloud->points[i].x = points[i * 3 + 0]; cloud->points[i].x = points[i * 3 + 0];
cloud->points[i].y = points[i * 3 + 1]; cloud->points[i].y = points[i * 3 + 1];
cloud->points[i].z = points[i * 3 + 2]; cloud->points[i].z = points[i * 3 + 2];
} }
cloud->width = n; cloud->width = (uint32_t)n;
cloud->height = 1; cloud->height = 1;
cloud->is_dense = true; cloud->is_dense = true;
return cloud; return cloud;