优化点云处理性能
This commit is contained in:
parent
0fc2f051f4
commit
8a06e6b9a1
@ -1,21 +1,31 @@
|
|||||||
/* GPU加速的点云处理头文件 */
|
/* GPU加速的点云处理头文件 - 优化版 */
|
||||||
|
/* 主要优化:
|
||||||
|
* 1. GPU内存持久化(避免每帧 malloc/free)
|
||||||
|
* 2. 锁页内存(Pinned Memory)加速 H2D/D2H 传输
|
||||||
|
* 3. CUDA Stream 并行处理两路雷达
|
||||||
|
* 4. 修复 voxelizeKernel 竞争条件(原子操作累加 x/y/z)
|
||||||
|
* 5. 修复 SOR 内核(正确的统计离群点去除算法)
|
||||||
|
* 6. 合并内核补全体素化 + SOR 逻辑
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef GPU_PROCESSING_H
|
#ifndef GPU_PROCESSING_H
|
||||||
#define GPU_PROCESSING_H
|
#define GPU_PROCESSING_H
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
#include <pcl/point_cloud.h>
|
#include <pcl/point_cloud.h>
|
||||||
#include <pcl/point_types.h>
|
#include <pcl/point_types.h>
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// 定义__device__和__host__函数装饰器
|
|
||||||
#ifdef __CUDACC__
|
#ifdef __CUDACC__
|
||||||
#define DEVICE_HOST __device__ __host__
|
#define DEVICE_HOST __device__ __host__
|
||||||
#else
|
#else
|
||||||
#define DEVICE_HOST
|
#define DEVICE_HOST
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 避免与CUDA的float3/int3冲突,使用自定义名称
|
/* ===================== 基础数据结构 ===================== */
|
||||||
|
|
||||||
struct PointFloat
|
struct PointFloat
|
||||||
{
|
{
|
||||||
float x, y, z;
|
float x, y, z;
|
||||||
@ -30,28 +40,153 @@ struct PointInt
|
|||||||
DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
|
DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 检查GPU是否可用
|
/* ===================== GPU 处理参数 ===================== */
|
||||||
|
|
||||||
|
struct GpuProcessingParams
|
||||||
|
{
|
||||||
|
// 空间过滤范围
|
||||||
|
float min_x, max_x;
|
||||||
|
float min_y, max_y;
|
||||||
|
float min_z, max_z;
|
||||||
|
|
||||||
|
// 体素化参数
|
||||||
|
float voxel_resolution; // 体素大小(米)
|
||||||
|
float grid_range; // 网格覆盖范围(米),以原点为中心
|
||||||
|
|
||||||
|
// SOR 参数(统计离群点去除)
|
||||||
|
int sor_mean_k; // 邻域点数 k
|
||||||
|
float sor_std_mult; // 标准差倍数阈值
|
||||||
|
|
||||||
|
// 车身过滤
|
||||||
|
float car_min_x, car_max_x;
|
||||||
|
float car_min_y, car_max_y;
|
||||||
|
float car_min_z, car_max_z;
|
||||||
|
bool filter_car;
|
||||||
|
|
||||||
|
// 扫刷过滤(右侧 / 左侧)
|
||||||
|
float brush_r_min_x, brush_r_max_x;
|
||||||
|
float brush_r_min_y, brush_r_max_y;
|
||||||
|
float brush_r_min_z, brush_r_max_z;
|
||||||
|
|
||||||
|
float brush_l_min_x, brush_l_max_x;
|
||||||
|
float brush_l_min_y, brush_l_max_y;
|
||||||
|
float brush_l_min_z, brush_l_max_z;
|
||||||
|
bool filter_brush;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* ===================== 持久化 GPU 资源管理器 =====================
|
||||||
|
*
|
||||||
|
* 在节点初始化时调用 GpuResources::init(),节点销毁时调用 destroy()。
|
||||||
|
* 所有 GPU 内核通过本类持有的设备/锁页内存及 CUDA Stream 执行,
|
||||||
|
* 彻底避免每帧 cudaMalloc/cudaFree 的开销。
|
||||||
|
*
|
||||||
|
* 双流设计:
|
||||||
|
* stream[0] → 前雷达
|
||||||
|
* stream[1] → 后雷达
|
||||||
|
*/
|
||||||
|
class GpuResources
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
/* ---- 单例访问 ---- */
|
||||||
|
static GpuResources& instance();
|
||||||
|
|
||||||
|
/* ---- 生命周期 ---- */
|
||||||
|
/**
|
||||||
|
* @brief 初始化 GPU 资源
|
||||||
|
* @param max_points_per_cloud 单路雷达最大点数,用于预分配内存
|
||||||
|
* @param max_voxel_grid_dim 体素网格每轴最大格数(= grid_range / voxel_resolution)
|
||||||
|
*/
|
||||||
|
bool init(int max_points_per_cloud, int max_voxel_grid_dim);
|
||||||
|
void destroy();
|
||||||
|
|
||||||
|
bool isInitialized() const { return initialized_; }
|
||||||
|
|
||||||
|
/* ---- 设备内存(按雷达通道索引:0=前, 1=后) ---- */
|
||||||
|
float* d_points[2]; // 输入点云 (N*3 floats)
|
||||||
|
uint8_t* d_pass_mask[2]; // 空间过滤掩码
|
||||||
|
uint8_t* d_sor_mask[2]; // SOR 掩码
|
||||||
|
int* d_voxel_counts[2]; // 体素命中计数
|
||||||
|
// 体素中心的 x/y/z 分量用三个独立 float 数组,配合 atomicAdd 的整数技巧
|
||||||
|
// 实际存储为定点整数(×1000 精度),避免 atomicAdd 不支持 float3
|
||||||
|
int* d_voxel_sum_x[2]; // 体素内点 x 坐标之和(×VOXEL_SCALE)
|
||||||
|
int* d_voxel_sum_y[2];
|
||||||
|
int* d_voxel_sum_z[2];
|
||||||
|
float* d_mean_dist[2]; // 每点到 k 邻居的平均距离(SOR 中间结果)
|
||||||
|
|
||||||
|
/* ---- 锁页内存(Host 侧) ---- */
|
||||||
|
float* h_points[2]; // 输入点云(锁页)
|
||||||
|
uint8_t* h_pass_mask[2]; // 过滤结果(锁页)
|
||||||
|
int* h_voxel_counts[2]; // 体素命中数(锁页)
|
||||||
|
int* h_voxel_sum_x[2];
|
||||||
|
int* h_voxel_sum_y[2];
|
||||||
|
int* h_voxel_sum_z[2];
|
||||||
|
|
||||||
|
/* ---- CUDA Streams ---- */
|
||||||
|
cudaStream_t stream[2];
|
||||||
|
|
||||||
|
/* ---- 容量 ---- */
|
||||||
|
int max_points;
|
||||||
|
int max_voxel_dim;
|
||||||
|
int max_voxels; // = max_voxel_dim^3
|
||||||
|
|
||||||
|
static constexpr int VOXEL_SCALE = 1000; // 定点精度:1mm
|
||||||
|
|
||||||
|
private:
|
||||||
|
GpuResources() : initialized_(false) {}
|
||||||
|
bool initialized_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* ===================== 设备可用性检查 ===================== */
|
||||||
|
|
||||||
bool isDeviceAvailable();
|
bool isDeviceAvailable();
|
||||||
|
|
||||||
// GPU加速的条件过滤
|
/* ===================== 核心 GPU 处理接口 ===================== */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief 完整点云处理流水线(异步,使用指定 stream)
|
||||||
|
*
|
||||||
|
* 执行顺序:
|
||||||
|
* 1. 空间过滤(Bounding Box + 车身/扫刷剔除)
|
||||||
|
* 2. 体素化降采样(Voxel Grid,原子累加求均值)
|
||||||
|
* 3. 统计离群点去除(SOR,在降采样后的点上执行)
|
||||||
|
*
|
||||||
|
* @param points 输入点云 float 数组(x0,y0,z0, x1,y1,z1, ...)
|
||||||
|
* @param num_points 点数
|
||||||
|
* @param params 处理参数
|
||||||
|
* @param channel 雷达通道(0=前, 1=后),决定使用哪条 Stream 和内存
|
||||||
|
* @return 处理后点云的 float 数组(格式同输入)
|
||||||
|
*/
|
||||||
|
std::vector<float> gpuProcessPointCloud(const float* points, int num_points, const GpuProcessingParams& params,
|
||||||
|
int channel = 0);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief 仅空间过滤(轻量版,供单独调用)
|
||||||
|
*/
|
||||||
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
float max_x, float max_y, float max_z);
|
float max_x, float max_y, float max_z);
|
||||||
|
|
||||||
// GPU加速的体素网格降采样
|
/**
|
||||||
|
* @brief 体素化降采样
|
||||||
|
*/
|
||||||
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
|
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
|
||||||
|
|
||||||
// GPU加速的统计离群点去除
|
/**
|
||||||
|
* @brief 统计离群点去除(在已降采样的点云上执行,避免 O(N²) 瓶颈)
|
||||||
|
*/
|
||||||
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
|
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
|
||||||
|
|
||||||
// GPU加速的合并处理(空间过滤+体素化+离群点去除)
|
/**
|
||||||
|
* @brief 合并内核(空间过滤 + 体素化 + SOR 一体化)
|
||||||
|
* 保持与旧接口兼容,内部使用持久化资源
|
||||||
|
*/
|
||||||
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
||||||
int mean_k, float std_mult);
|
int mean_k, float std_mult);
|
||||||
|
|
||||||
// 辅助函数:将PCL点云转换为简单浮点数组
|
/* ===================== 辅助函数 ===================== */
|
||||||
|
|
||||||
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
|
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
|
||||||
|
|
||||||
// 辅助函数:将浮点数组转换为PCL点云
|
|
||||||
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
|
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
|
||||||
|
|
||||||
#endif // GPU_PROCESSING_H
|
#endif // GPU_PROCESSING_H
|
||||||
@ -79,7 +79,10 @@ def generate_launch_description():
|
|||||||
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
|
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
|
||||||
# 单元格尺寸:由 grid_range / grid_size 决定
|
# 单元格尺寸:由 grid_range / grid_size 决定
|
||||||
"enable_print": False, # 是否打印栅格
|
"enable_print": False, # 是否打印栅格
|
||||||
"use_gpu": False, # 是否使用GPU加速
|
"use_gpu": True, # 是否使用GPU加速
|
||||||
|
"sor_method": 1, # 推荐先用1,效果不满意再试2
|
||||||
|
"radius_search": 0.5, # 搜索半径(米),越大越严格
|
||||||
|
"min_neighbors": 3, # 至少需要N个邻居才保留
|
||||||
# 车身过滤参数
|
# 车身过滤参数
|
||||||
"filter_car": True, # 是否启用车身过滤
|
"filter_car": True, # 是否启用车身过滤
|
||||||
"car_length": 1.8, # 车长(米)
|
"car_length": 1.8, # 车长(米)
|
||||||
|
|||||||
@ -1,310 +1,713 @@
|
|||||||
/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */
|
/* GPU加速的点云处理 - 完整优化版
|
||||||
|
|
||||||
#include <cuda_runtime.h>
|
*/
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "gpu_processing.h"
|
#include "gpu_processing.h"
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef USE_GPU
|
||||||
#define DEVICE_FUNC __device__ __host__
|
#include <cuda_runtime.h>
|
||||||
#else
|
|
||||||
#define DEVICE_FUNC inline
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__constant__ float d_const_params[9];
|
/* ================================================================
|
||||||
__constant__ int d_const_ints[2];
|
* 宏与工具
|
||||||
|
* ================================================================ */
|
||||||
|
|
||||||
DEVICE_FUNC bool isDeviceAvailable()
|
#define CUDA_CHECK(call) \
|
||||||
|
do \
|
||||||
|
{ \
|
||||||
|
cudaError_t err = (call); \
|
||||||
|
if (err != cudaSuccess) \
|
||||||
|
{ \
|
||||||
|
fprintf(stderr, "[CUDA ERROR] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/* 每个线程块的线程数(256 是 Jetson Orin 上的最优选择)*/
|
||||||
|
static constexpr int BLOCK_SIZE = 256;
|
||||||
|
|
||||||
|
/* 体素坐标定点精度(毫米级,避免 atomicAdd 不支持 float 的问题)*/
|
||||||
|
static constexpr int VOXEL_SCALE = GpuResources::VOXEL_SCALE;
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
* GpuResources 单例实现
|
||||||
|
* ================================================================ */
|
||||||
|
|
||||||
|
GpuResources& GpuResources::instance()
|
||||||
{
|
{
|
||||||
#if defined(USE_GPU) && defined(__CUDACC__)
|
static GpuResources res;
|
||||||
int count;
|
return res;
|
||||||
cudaGetDeviceCount(&count);
|
}
|
||||||
return count > 0;
|
|
||||||
|
bool GpuResources::init(int max_points_per_cloud, int max_voxel_grid_dim)
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
if (initialized_) return true;
|
||||||
|
|
||||||
|
max_points = max_points_per_cloud;
|
||||||
|
max_voxel_dim = max_voxel_grid_dim;
|
||||||
|
max_voxels = max_voxel_dim * max_voxel_dim * max_voxel_dim;
|
||||||
|
|
||||||
|
for (int ch = 0; ch < 2; ++ch)
|
||||||
|
{
|
||||||
|
/* ---- CUDA Stream ---- */
|
||||||
|
CUDA_CHECK(cudaStreamCreateWithFlags(&stream[ch], cudaStreamNonBlocking));
|
||||||
|
|
||||||
|
/* ---- 设备内存 ---- */
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_points[ch], (size_t)max_points * 3 * sizeof(float)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_pass_mask[ch], (size_t)max_points * sizeof(uint8_t)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_sor_mask[ch], (size_t)max_points * sizeof(uint8_t)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_voxel_counts[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&d_mean_dist[ch], (size_t)max_points * sizeof(float)));
|
||||||
|
|
||||||
|
/* ---- 锁页内存(Host) ---- */
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_points[ch], (size_t)max_points * 3 * sizeof(float)));
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_pass_mask[ch], (size_t)max_points * sizeof(uint8_t)));
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_voxel_counts[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_voxel_sum_x[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_voxel_sum_y[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
CUDA_CHECK(cudaMallocHost(&h_voxel_sum_z[ch], (size_t)max_voxels * sizeof(int)));
|
||||||
|
}
|
||||||
|
|
||||||
|
initialized_ = true;
|
||||||
|
fprintf(stdout,
|
||||||
|
"[GpuResources] Initialized: max_points=%d, max_voxel_dim=%d, "
|
||||||
|
"max_voxels=%d\n",
|
||||||
|
max_points, max_voxel_dim, max_voxels);
|
||||||
|
return true;
|
||||||
|
#else
|
||||||
|
(void)max_points_per_cloud;
|
||||||
|
(void)max_voxel_grid_dim;
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void GpuResources::destroy()
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
if (!initialized_) return;
|
||||||
|
|
||||||
|
for (int ch = 0; ch < 2; ++ch)
|
||||||
|
{
|
||||||
|
cudaStreamSynchronize(stream[ch]);
|
||||||
|
cudaStreamDestroy(stream[ch]);
|
||||||
|
|
||||||
|
cudaFree(d_points[ch]);
|
||||||
|
cudaFree(d_pass_mask[ch]);
|
||||||
|
cudaFree(d_sor_mask[ch]);
|
||||||
|
cudaFree(d_voxel_counts[ch]);
|
||||||
|
cudaFree(d_voxel_sum_x[ch]);
|
||||||
|
cudaFree(d_voxel_sum_y[ch]);
|
||||||
|
cudaFree(d_voxel_sum_z[ch]);
|
||||||
|
cudaFree(d_mean_dist[ch]);
|
||||||
|
|
||||||
|
cudaFreeHost(h_points[ch]);
|
||||||
|
cudaFreeHost(h_pass_mask[ch]);
|
||||||
|
cudaFreeHost(h_voxel_counts[ch]);
|
||||||
|
cudaFreeHost(h_voxel_sum_x[ch]);
|
||||||
|
cudaFreeHost(h_voxel_sum_y[ch]);
|
||||||
|
cudaFreeHost(h_voxel_sum_z[ch]);
|
||||||
|
}
|
||||||
|
|
||||||
|
initialized_ = false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
* 设备可用性
|
||||||
|
* ================================================================ */
|
||||||
|
|
||||||
|
bool isDeviceAvailable()
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
int count = 0;
|
||||||
|
cudaError_t err = cudaGetDeviceCount(&count);
|
||||||
|
return (err == cudaSuccess && count > 0);
|
||||||
#else
|
#else
|
||||||
return false;
|
return false;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// 🔥 重型 GPU 内核:计算密集型,强制 GPU 满载
|
* CUDA 内核函数
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
|
|
||||||
|
/* ----------------------------------------------------------------
|
||||||
|
* 内核 1:空间过滤 + 车身/扫刷剔除
|
||||||
|
*
|
||||||
|
* 每个线程处理一个点,输出 pass_mask[i]=1 表示该点通过过滤。
|
||||||
|
* ---------------------------------------------------------------- */
|
||||||
|
__global__ void kernelSpatialFilter(const float* __restrict__ points, int num_points, float min_x, float max_x,
|
||||||
|
float min_y, float max_y, float min_z, float max_z,
|
||||||
|
/* 车身包围盒(世界坐标,负向剔除) */
|
||||||
|
float car_min_x, float car_max_x, float car_min_y, float car_max_y, float car_min_z,
|
||||||
|
float car_max_z, bool filter_car,
|
||||||
|
/* 右侧扫刷 */
|
||||||
|
float brush_r_min_x, float brush_r_max_x, float brush_r_min_y, float brush_r_max_y,
|
||||||
|
float brush_r_min_z, float brush_r_max_z,
|
||||||
|
/* 左侧扫刷 */
|
||||||
|
float brush_l_min_x, float brush_l_max_x, float brush_l_min_y, float brush_l_max_y,
|
||||||
|
float brush_l_min_z, float brush_l_max_z, bool filter_brush,
|
||||||
|
uint8_t* __restrict__ pass_mask)
|
||||||
{
|
{
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
if (idx >= num_points) return;
|
if (idx >= num_points) return;
|
||||||
|
|
||||||
const float min_x = d_const_params[0];
|
float px = points[idx * 3 + 0];
|
||||||
const float min_y = d_const_params[1];
|
float py = points[idx * 3 + 1];
|
||||||
const float min_z = d_const_params[2];
|
float pz = points[idx * 3 + 2];
|
||||||
const float max_x = d_const_params[3];
|
|
||||||
const float max_y = d_const_params[4];
|
|
||||||
const float max_z = d_const_params[5];
|
|
||||||
const float voxel_res = d_const_params[6];
|
|
||||||
const float grid_range = d_const_params[7];
|
|
||||||
const float std_mult = d_const_params[8];
|
|
||||||
const int mean_k = d_const_ints[0];
|
|
||||||
const int grid_size = d_const_ints[1];
|
|
||||||
|
|
||||||
const float x = points[idx * 3 + 0];
|
/* 1. Bounding box 过滤 */
|
||||||
const float y = points[idx * 3 + 1];
|
bool valid = (px > min_x && px < max_x && py > min_y && py < max_y && pz > min_z && pz < max_z);
|
||||||
const float z = points[idx * 3 + 2];
|
|
||||||
|
|
||||||
// 1. 空间过滤
|
/* 2. 车身剔除(在 bbox 内的点中剔除车身范围) */
|
||||||
bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z);
|
if (valid && filter_car)
|
||||||
if (!valid)
|
|
||||||
{
|
{
|
||||||
output_mask[idx] = false;
|
bool in_car = (px >= car_min_x && px <= car_max_x && py >= car_min_y && py <= car_max_y && pz >= car_min_z &&
|
||||||
return;
|
pz <= car_max_z);
|
||||||
|
if (in_car) valid = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. 体素过滤
|
/* 3. 扫刷剔除 */
|
||||||
const float half = grid_range * 0.5f;
|
if (valid && filter_brush)
|
||||||
int vx = __float2int_rn((half - x) / voxel_res);
|
|
||||||
int vy = __float2int_rn((y + half) / voxel_res);
|
|
||||||
int vz = __float2int_rn((z + half) / voxel_res);
|
|
||||||
valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
|
|
||||||
if (!valid)
|
|
||||||
{
|
{
|
||||||
output_mask[idx] = false;
|
bool in_right = (px >= brush_r_min_x && px <= brush_r_max_x && py >= brush_r_min_y && py <= brush_r_max_y &&
|
||||||
return;
|
pz >= brush_r_min_z && pz <= brush_r_max_z);
|
||||||
|
bool in_left = (px >= brush_l_min_x && px <= brush_l_max_x && py >= brush_l_min_y && py <= brush_l_max_y &&
|
||||||
|
pz >= brush_l_min_z && pz <= brush_l_max_z);
|
||||||
|
if (in_right || in_left) valid = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// #################################################################
|
pass_mask[idx] = valid ? 1 : 0;
|
||||||
// 🔥🔥🔥 这里是强制高计算量,让 GPU 跑满的核心
|
}
|
||||||
// #################################################################
|
|
||||||
__shared__ float s_points[1024 * 3];
|
|
||||||
int lid = threadIdx.x;
|
|
||||||
|
|
||||||
s_points[lid * 3 + 0] = x;
|
/* ----------------------------------------------------------------
|
||||||
s_points[lid * 3 + 1] = y;
|
* 内核 2:体素化(原子累加 + 定点整数,修复竞争条件)
|
||||||
s_points[lid * 3 + 2] = z;
|
*
|
||||||
__syncthreads();
|
* 修复说明:
|
||||||
|
* 原版用 float3 做均值累加,但 atomicAdd 不支持 float3,
|
||||||
|
* 多线程同时读写导致数据竞争,结果错误。
|
||||||
|
*
|
||||||
|
* 新版:将坐标乘以 VOXEL_SCALE(=1000) 转为 int,
|
||||||
|
* 用 atomicAdd(int*) 累加,最后除以(count × VOXEL_SCALE)还原均值。
|
||||||
|
* atomicAdd(int*) 在所有 CUDA 设备上均支持。
|
||||||
|
*
|
||||||
|
* pass_mask 作为输入门控:仅处理 pass_mask[idx]==1 的点。
|
||||||
|
* ---------------------------------------------------------------- */
|
||||||
|
__global__ void kernelVoxelize(const float* __restrict__ points, const uint8_t* __restrict__ pass_mask, int num_points,
|
||||||
|
float resolution, float range, /* 以原点为中心的覆盖范围(对称) */
|
||||||
|
int grid_dim, /* 每轴格数 = range / resolution */
|
||||||
|
int* __restrict__ voxel_counts, int* __restrict__ voxel_sum_x,
|
||||||
|
int* __restrict__ voxel_sum_y, int* __restrict__ voxel_sum_z)
|
||||||
|
{
|
||||||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (idx >= num_points) return;
|
||||||
|
if (!pass_mask[idx]) return; /* 已被空间过滤剔除 */
|
||||||
|
|
||||||
|
float px = points[idx * 3 + 0];
|
||||||
|
float py = points[idx * 3 + 1];
|
||||||
|
float pz = points[idx * 3 + 2];
|
||||||
|
|
||||||
|
float half = range * 0.5f;
|
||||||
|
int vx = __float2int_rd((px + half) / resolution);
|
||||||
|
int vy = __float2int_rd((py + half) / resolution);
|
||||||
|
int vz = __float2int_rd((pz + half) / resolution);
|
||||||
|
|
||||||
|
/* 边界检查 */
|
||||||
|
if (vx < 0 || vx >= grid_dim || vy < 0 || vy >= grid_dim || vz < 0 || vz >= grid_dim) return;
|
||||||
|
|
||||||
|
int linear = vz * grid_dim * grid_dim + vy * grid_dim + vx;
|
||||||
|
|
||||||
|
atomicAdd(&voxel_counts[linear], 1);
|
||||||
|
/* 定点整数累加,彻底避免竞争条件 */
|
||||||
|
atomicAdd(&voxel_sum_x[linear], __float2int_rd(px * VOXEL_SCALE));
|
||||||
|
atomicAdd(&voxel_sum_y[linear], __float2int_rd(py * VOXEL_SCALE));
|
||||||
|
atomicAdd(&voxel_sum_z[linear], __float2int_rd(pz * VOXEL_SCALE));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------
|
||||||
|
* 内核 3:将体素结果打包为连续点云数组
|
||||||
|
*
|
||||||
|
* 输出:voxel_points(每个非空体素的均值中心点),out_count 为输出点数。
|
||||||
|
* 注意:out_count 用 atomicAdd 写入,初始化为 0。
|
||||||
|
* ---------------------------------------------------------------- */
|
||||||
|
__global__ void kernelCollectVoxels(const int* __restrict__ voxel_counts, const int* __restrict__ voxel_sum_x,
|
||||||
|
const int* __restrict__ voxel_sum_y, const int* __restrict__ voxel_sum_z,
|
||||||
|
int num_voxels, float voxel_scale_inv, /* = 1.0f / VOXEL_SCALE */
|
||||||
|
float* __restrict__ voxel_points, int* __restrict__ out_count)
|
||||||
|
{
|
||||||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (idx >= num_voxels) return;
|
||||||
|
|
||||||
|
int cnt = voxel_counts[idx];
|
||||||
|
if (cnt == 0) return;
|
||||||
|
|
||||||
|
float inv_cnt = 1.0f / (float)cnt;
|
||||||
|
float cx = (float)voxel_sum_x[idx] * voxel_scale_inv * inv_cnt;
|
||||||
|
float cy = (float)voxel_sum_y[idx] * voxel_scale_inv * inv_cnt;
|
||||||
|
float cz = (float)voxel_sum_z[idx] * voxel_scale_inv * inv_cnt;
|
||||||
|
|
||||||
|
int pos = atomicAdd(out_count, 1);
|
||||||
|
voxel_points[pos * 3 + 0] = cx;
|
||||||
|
voxel_points[pos * 3 + 1] = cy;
|
||||||
|
voxel_points[pos * 3 + 2] = cz;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----------------------------------------------------------------
|
||||||
|
* 内核 4:统计离群点去除(SOR)
|
||||||
|
*
|
||||||
|
* 修复说明(原版两个 bug):
|
||||||
|
* Bug 1:mean_k 参数同时被当作「邻居数 k」和「距离阈值」。
|
||||||
|
* 修复:分离为 k(整数邻居数)和 global_mean/global_std(统计量)。
|
||||||
|
* Bug 2:遍历所有点做 O(N²) 暴力搜索,N 大时 GPU 也会超时。
|
||||||
|
* 修复:SOR 在体素化后的点云上执行,点数已大幅缩减(通常 <5000)。
|
||||||
|
*
|
||||||
|
* 算法:
|
||||||
|
* 1. 每个线程负责一个点 p_i
|
||||||
|
* 2. 遍历所有其他点,找到距离最近的 k 个点,计算平均距离 d_i
|
||||||
|
* 3. 设全局 mean_d = Σd_i/N, std_d = sqrt(Σ(d_i-mean_d)²/N)
|
||||||
|
* (mean_d 和 std_d 在 CPU 上计算后作为参数传入)
|
||||||
|
* 4. 若 d_i > mean_d + std_mult * std_d,则剔除
|
||||||
|
*
|
||||||
|
* 注意:本内核仅执行步骤 2,输出每点的 mean_k_dist。
|
||||||
|
* 步骤 3/4 在主机侧 CPU 计算(数据量小,几微秒)。
|
||||||
|
* ---------------------------------------------------------------- */
|
||||||
|
__global__ void kernelComputeMeanKDist(const float* __restrict__ points, int num_points, int k,
|
||||||
|
float* __restrict__ mean_k_dist) /* 输出:每点到k近邻的平均距离 */
|
||||||
|
{
|
||||||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (idx >= num_points) return;
|
||||||
|
|
||||||
|
float px = points[idx * 3 + 0];
|
||||||
|
float py = points[idx * 3 + 1];
|
||||||
|
float pz = points[idx * 3 + 2];
|
||||||
|
|
||||||
|
/* 用局部小数组维护 top-k 最小距离(k 通常为 10~30)
|
||||||
|
* 采用部分选择排序,避免在寄存器中分配大数组
|
||||||
|
* 这里 k 最大支持 64,超过则截断 */
|
||||||
|
const int MAX_K = 64;
|
||||||
|
int actual_k = (k > MAX_K) ? MAX_K : k;
|
||||||
|
|
||||||
|
float top_dists[MAX_K];
|
||||||
|
for (int i = 0; i < actual_k; ++i) top_dists[i] = 1e30f;
|
||||||
|
|
||||||
|
for (int j = 0; j < num_points; ++j)
|
||||||
|
{
|
||||||
|
if (j == idx) continue;
|
||||||
|
float dx = points[j * 3 + 0] - px;
|
||||||
|
float dy = points[j * 3 + 1] - py;
|
||||||
|
float dz = points[j * 3 + 2] - pz;
|
||||||
|
float dist = sqrtf(dx * dx + dy * dy + dz * dz);
|
||||||
|
|
||||||
|
/* 插入到 top-k 数组(保持升序) */
|
||||||
|
if (dist < top_dists[actual_k - 1])
|
||||||
|
{
|
||||||
|
top_dists[actual_k - 1] = dist;
|
||||||
|
/* 插入排序:将新值移到正确位置 */
|
||||||
|
for (int m = actual_k - 2; m >= 0; --m)
|
||||||
|
{
|
||||||
|
if (top_dists[m] > top_dists[m + 1])
|
||||||
|
{
|
||||||
|
float tmp = top_dists[m];
|
||||||
|
top_dists[m] = top_dists[m + 1];
|
||||||
|
top_dists[m + 1] = tmp;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 计算 k 近邻平均距离 */
|
||||||
float sum = 0.0f;
|
float sum = 0.0f;
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
for (int i = 0; i < actual_k; ++i)
|
||||||
// 循环多次,强行增加计算量 → GPU 占用拉满
|
|
||||||
for (int loop = 0; loop < 6; loop++)
|
|
||||||
for (int i = 0; i < blockDim.x; i++)
|
|
||||||
{
|
{
|
||||||
if (i == lid) continue;
|
if (top_dists[i] < 1e29f)
|
||||||
float sx = s_points[i * 3 + 0];
|
{
|
||||||
float sy = s_points[i * 3 + 1];
|
sum += top_dists[i];
|
||||||
float sz = s_points[i * 3 + 2];
|
++cnt;
|
||||||
|
|
||||||
float dx = x - sx;
|
|
||||||
float dy = y - sy;
|
|
||||||
float dz = z - sz;
|
|
||||||
float d = sqrtf(dx * dx + dy * dy + dz * dz);
|
|
||||||
|
|
||||||
// 额外浮点运算,提高负载
|
|
||||||
d = __fdividef(d, voxel_res);
|
|
||||||
d = __fdividef(d, std_mult);
|
|
||||||
d = __fadd_rn(d, loop * 0.1f);
|
|
||||||
|
|
||||||
sum += d;
|
|
||||||
cnt++;
|
|
||||||
}
|
}
|
||||||
__syncthreads();
|
}
|
||||||
|
mean_k_dist[idx] = (cnt > 0) ? (sum / cnt) : 1e30f;
|
||||||
output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// 初始化常量内存
|
* 主机端辅助:CPU 计算 SOR 统计量并生成有效索引
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res,
|
static std::vector<int> cpuSORFilter(const float* mean_k_dist, int num_points, float std_mult)
|
||||||
float grid_range, float std_mult, int mean_k, int grid_size)
|
|
||||||
{
|
{
|
||||||
const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult};
|
/* 计算均值 */
|
||||||
const int ints[2] = {mean_k, grid_size};
|
double sum = 0.0;
|
||||||
cudaMemcpyToSymbol(d_const_params, params, sizeof(params));
|
int valid = 0;
|
||||||
cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints));
|
for (int i = 0; i < num_points; ++i)
|
||||||
|
{
|
||||||
|
if (mean_k_dist[i] < 1e29f)
|
||||||
|
{
|
||||||
|
sum += mean_k_dist[i];
|
||||||
|
++valid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (valid == 0) return {};
|
||||||
|
double mean = sum / valid;
|
||||||
|
|
||||||
|
/* 计算标准差 */
|
||||||
|
double sq_sum = 0.0;
|
||||||
|
for (int i = 0; i < num_points; ++i)
|
||||||
|
{
|
||||||
|
if (mean_k_dist[i] < 1e29f)
|
||||||
|
{
|
||||||
|
double diff = mean_k_dist[i] - mean;
|
||||||
|
sq_sum += diff * diff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double std_dev = std::sqrt(sq_sum / valid);
|
||||||
|
double threshold = mean + std_mult * std_dev;
|
||||||
|
|
||||||
|
std::vector<int> indices;
|
||||||
|
indices.reserve(valid);
|
||||||
|
for (int i = 0; i < num_points; ++i)
|
||||||
|
{
|
||||||
|
if (mean_k_dist[i] <= threshold) indices.push_back(i);
|
||||||
|
}
|
||||||
|
return indices;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// 主接口
|
* 核心 GPU 处理接口实现(完整流水线)
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
|
|
||||||
|
std::vector<float> gpuProcessPointCloud(const float* points_in, int num_points, const GpuProcessingParams& params,
|
||||||
|
int channel)
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
auto& res = GpuResources::instance();
|
||||||
|
if (!res.isInitialized() || !isDeviceAvailable()) return {};
|
||||||
|
if (num_points <= 0 || num_points > res.max_points) return {};
|
||||||
|
|
||||||
|
channel = (channel == 0) ? 0 : 1;
|
||||||
|
cudaStream_t st = res.stream[channel];
|
||||||
|
|
||||||
|
/* ---- Step 0:将输入点云拷贝到锁页内存,再异步传输到设备 ---- */
|
||||||
|
std::memcpy(res.h_points[channel], points_in, (size_t)num_points * 3 * sizeof(float));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.d_points[channel], res.h_points[channel], (size_t)num_points * 3 * sizeof(float),
|
||||||
|
cudaMemcpyHostToDevice, st));
|
||||||
|
|
||||||
|
/* ---- Step 1:空间过滤 ---- */
|
||||||
|
int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
|
||||||
|
kernelSpatialFilter<<<blocks_pts, BLOCK_SIZE, 0, st>>>(
|
||||||
|
res.d_points[channel], num_points, params.min_x, params.max_x, params.min_y, params.max_y, params.min_z,
|
||||||
|
params.max_z, params.car_min_x, params.car_max_x, params.car_min_y, params.car_max_y, params.car_min_z,
|
||||||
|
params.car_max_z, params.filter_car, params.brush_r_min_x, params.brush_r_max_x, params.brush_r_min_y,
|
||||||
|
params.brush_r_max_y, params.brush_r_min_z, params.brush_r_max_z, params.brush_l_min_x, params.brush_l_max_x,
|
||||||
|
params.brush_l_min_y, params.brush_l_max_y, params.brush_l_min_z, params.brush_l_max_z, params.filter_brush,
|
||||||
|
res.d_pass_mask[channel]);
|
||||||
|
|
||||||
|
/* ---- Step 2:体素化降采样 ---- */
|
||||||
|
int grid_dim = static_cast<int>(params.grid_range / params.voxel_resolution);
|
||||||
|
if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim;
|
||||||
|
int num_voxels = grid_dim * grid_dim * grid_dim;
|
||||||
|
|
||||||
|
/* 清零体素数组(异步) */
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[channel], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[channel], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[channel], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[channel], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
|
||||||
|
kernelVoxelize<<<blocks_pts, BLOCK_SIZE, 0, st>>>(res.d_points[channel], res.d_pass_mask[channel], num_points,
|
||||||
|
params.voxel_resolution, params.grid_range, grid_dim,
|
||||||
|
res.d_voxel_counts[channel], res.d_voxel_sum_x[channel],
|
||||||
|
res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel]);
|
||||||
|
|
||||||
|
/* 收集有效体素到紧凑点云数组
|
||||||
|
* 复用 d_pass_mask 的前 sizeof(int) 字节做 out_count(节省内存)*/
|
||||||
|
int* d_voxel_out_count = reinterpret_cast<int*>(res.d_sor_mask[channel]);
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(d_voxel_out_count, 0, sizeof(int), st));
|
||||||
|
|
||||||
|
/* 体素化后点云写入 d_mean_dist 空间(float 数组,容量足够)*/
|
||||||
|
float* d_voxel_pts = res.d_mean_dist[channel];
|
||||||
|
int blocks_vox = (num_voxels + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
|
||||||
|
kernelCollectVoxels<<<blocks_vox, BLOCK_SIZE, 0, st>>>(
|
||||||
|
res.d_voxel_counts[channel], res.d_voxel_sum_x[channel], res.d_voxel_sum_y[channel], res.d_voxel_sum_z[channel],
|
||||||
|
num_voxels, 1.0f / (float)VOXEL_SCALE, d_voxel_pts, d_voxel_out_count);
|
||||||
|
|
||||||
|
/* 回传体素化后的点数 */
|
||||||
|
int voxel_point_count = 0;
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(&voxel_point_count, d_voxel_out_count, sizeof(int), cudaMemcpyDeviceToHost, st));
|
||||||
|
|
||||||
|
/* 等待 stream 完成,获取精确点数后再做 SOR */
|
||||||
|
CUDA_CHECK(cudaStreamSynchronize(st));
|
||||||
|
|
||||||
|
if (voxel_point_count <= 0) return {};
|
||||||
|
|
||||||
|
/* ---- Step 3:SOR(在体素化后的点云上,点数已大幅减小)----
|
||||||
|
*
|
||||||
|
* 3a. GPU 计算每点的 mean-k-dist
|
||||||
|
* 3b. 回传到锁页内存(通过 h_voxel_sum_x 复用,float 类型)
|
||||||
|
* 3c. CPU 计算均值/标准差并筛选有效点
|
||||||
|
*/
|
||||||
|
int blocks_sor = (voxel_point_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
|
||||||
|
/* 用 d_voxel_sum_x 临时存放 mean_k_dist(int 和 float 等宽,安全转换)*/
|
||||||
|
float* d_mkdist = reinterpret_cast<float*>(res.d_voxel_sum_x[channel]);
|
||||||
|
|
||||||
|
kernelComputeMeanKDist<<<blocks_sor, BLOCK_SIZE, 0, st>>>(d_voxel_pts, voxel_point_count, params.sor_mean_k,
|
||||||
|
d_mkdist);
|
||||||
|
|
||||||
|
/* 用锁页内存接收 mean_k_dist 结果 */
|
||||||
|
float* h_mkdist = reinterpret_cast<float*>(res.h_voxel_sum_x[channel]);
|
||||||
|
CUDA_CHECK(
|
||||||
|
cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)voxel_point_count * sizeof(float), cudaMemcpyDeviceToHost, st));
|
||||||
|
|
||||||
|
/* 同时回传体素化后的点云(用于 CPU 侧筛选) */
|
||||||
|
float* h_voxel_pts = res.h_points[channel]; /* 复用锁页内存,voxel 点数 << max_points */
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(h_voxel_pts, d_voxel_pts, (size_t)voxel_point_count * 3 * sizeof(float),
|
||||||
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaStreamSynchronize(st));
|
||||||
|
|
||||||
|
/* 3c. CPU 侧 SOR 筛选(数据量小,耗时可忽略)*/
|
||||||
|
auto sor_indices = cpuSORFilter(h_mkdist, voxel_point_count, params.sor_std_mult);
|
||||||
|
|
||||||
|
if (sor_indices.empty()) return {};
|
||||||
|
|
||||||
|
/* ---- Step 4:组装最终输出点云 ---- */
|
||||||
|
std::vector<float> result;
|
||||||
|
result.reserve(sor_indices.size() * 3);
|
||||||
|
for (int idx : sor_indices)
|
||||||
|
{
|
||||||
|
result.push_back(h_voxel_pts[idx * 3 + 0]);
|
||||||
|
result.push_back(h_voxel_pts[idx * 3 + 1]);
|
||||||
|
result.push_back(h_voxel_pts[idx * 3 + 2]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
|
||||||
|
#else
|
||||||
|
(void)points_in;
|
||||||
|
(void)num_points;
|
||||||
|
(void)params;
|
||||||
|
(void)channel;
|
||||||
|
return {};
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
* 兼容旧接口:gpuMergedProcessing
|
||||||
|
* ================================================================ */
|
||||||
|
|
||||||
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
||||||
int mean_k, float std_mult)
|
int mean_k, float std_mult)
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
if (!isDeviceAvailable() || points.empty()) return {};
|
int num_points = (int)(points.size() / 3);
|
||||||
|
if (num_points == 0) return {};
|
||||||
|
|
||||||
int num_points = points.size() / 3;
|
GpuProcessingParams params{};
|
||||||
int grid_size = static_cast<int>(grid_range / voxel_resolution);
|
params.min_x = min_x;
|
||||||
initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k,
|
params.max_x = max_x;
|
||||||
grid_size);
|
params.min_y = min_y;
|
||||||
|
params.max_y = max_y;
|
||||||
|
params.min_z = min_z;
|
||||||
|
params.max_z = max_z;
|
||||||
|
params.voxel_resolution = voxel_resolution;
|
||||||
|
params.grid_range = grid_range;
|
||||||
|
params.sor_mean_k = mean_k;
|
||||||
|
params.sor_std_mult = std_mult;
|
||||||
|
params.filter_car = false;
|
||||||
|
params.filter_brush = false;
|
||||||
|
|
||||||
cudaStream_t stream;
|
auto result_pts = gpuProcessPointCloud(points.data(), num_points, params, 0);
|
||||||
cudaStreamCreate(&stream);
|
|
||||||
|
|
||||||
float* d_points;
|
/* 返回结果点在原始点云中的索引(为保持接口兼容,顺序编号)*/
|
||||||
bool* d_mask;
|
int out_count = (int)(result_pts.size() / 3);
|
||||||
|
std::vector<int> indices(out_count);
|
||||||
cudaMallocAsync(&d_points, points.size() * sizeof(float), stream);
|
for (int i = 0; i < out_count; ++i) indices[i] = i;
|
||||||
cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream);
|
return indices;
|
||||||
|
|
||||||
cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
|
|
||||||
cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream);
|
|
||||||
|
|
||||||
// 🔥 超大线程块 → 高并发 → GPU 跑满
|
|
||||||
const int block = 1024;
|
|
||||||
const int grid = (num_points + block - 1) / block;
|
|
||||||
|
|
||||||
// 执行重型内核
|
|
||||||
heavyMergedKernel<<<grid, block, 0, stream>>>(d_points, num_points, d_mask);
|
|
||||||
|
|
||||||
std::vector<uint8_t> host_mask(num_points);
|
|
||||||
cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
|
|
||||||
|
|
||||||
cudaStreamSynchronize(stream);
|
|
||||||
|
|
||||||
std::vector<int> res;
|
|
||||||
res.reserve(num_points / 2);
|
|
||||||
for (int i = 0; i < num_points; i++)
|
|
||||||
{
|
|
||||||
if (host_mask[i]) res.push_back(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaFreeAsync(d_points, stream);
|
|
||||||
cudaFreeAsync(d_mask, stream);
|
|
||||||
cudaStreamDestroy(stream);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
#else
|
#else
|
||||||
return {};
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// 接口封装
|
* 单独过滤接口(轻量版,不做体素化和 SOR)
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
|
|
||||||
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
float max_x, float max_y, float max_z)
|
float max_x, float max_y, float max_z)
|
||||||
{
|
|
||||||
return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, 0.1f, 20.0f, 10, 1.0f);
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================
|
|
||||||
// 体素滤波
|
|
||||||
// =================================================================
|
|
||||||
__global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts,
|
|
||||||
float3* centers, int grid_size)
|
|
||||||
{
|
|
||||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
if (i >= num_points) return;
|
|
||||||
|
|
||||||
float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2];
|
|
||||||
float h = range * 0.5f;
|
|
||||||
|
|
||||||
int vx = __float2int_rn((h - x) / res);
|
|
||||||
int vy = __float2int_rn((y + h) / res);
|
|
||||||
int vz = __float2int_rn((z + h) / res);
|
|
||||||
|
|
||||||
if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return;
|
|
||||||
int idx = vz * grid_size * grid_size + vy * grid_size + vx;
|
|
||||||
int n = atomicAdd(&counts[idx], 1);
|
|
||||||
|
|
||||||
if (n == 0)
|
|
||||||
{
|
|
||||||
centers[idx] = make_float3(x, y, z);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
float3 c = centers[idx];
|
|
||||||
c.x = (c.x * n + x) / (n + 1);
|
|
||||||
c.y = (c.y * n + y) / (n + 1);
|
|
||||||
c.z = (c.z * n + z) / (n + 1);
|
|
||||||
centers[idx] = c;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float res, float range)
|
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
if (points.empty()) return {};
|
auto& res = GpuResources::instance();
|
||||||
int n = points.size() / 3;
|
if (!res.isInitialized() || !isDeviceAvailable()) return {};
|
||||||
int g = static_cast<int>(range / res);
|
|
||||||
size_t total = g * g * g;
|
|
||||||
|
|
||||||
std::vector<int> cnt(total, 0);
|
int num_points = (int)(points.size() / 3);
|
||||||
std::vector<float3> cen(total);
|
if (num_points == 0 || num_points > res.max_points) return {};
|
||||||
|
|
||||||
float* d_p;
|
cudaStream_t st = res.stream[0];
|
||||||
int* d_c;
|
|
||||||
float3* d_cen;
|
|
||||||
|
|
||||||
cudaMalloc(&d_p, points.size() * sizeof(float));
|
std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
|
||||||
cudaMalloc(&d_c, total * sizeof(int));
|
CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
|
||||||
cudaMalloc(&d_cen, total * sizeof(float3));
|
cudaMemcpyHostToDevice, st));
|
||||||
|
|
||||||
cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
|
int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice);
|
|
||||||
cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice);
|
|
||||||
|
|
||||||
int block = 1024;
|
kernelSpatialFilter<<<blocks, BLOCK_SIZE, 0, st>>>(res.d_points[0], num_points, min_x, max_x, min_y, max_y, min_z,
|
||||||
int grid = (n + block - 1) / block;
|
max_z, 0, 0, 0, 0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
voxelizeKernelOpt<<<grid, block>>>(d_p, n, res, range, d_c, d_cen, g);
|
0, false, res.d_pass_mask[0]);
|
||||||
|
|
||||||
cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost);
|
CUDA_CHECK(cudaMemcpyAsync(res.h_pass_mask[0], res.d_pass_mask[0], (size_t)num_points * sizeof(uint8_t),
|
||||||
cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost);
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaStreamSynchronize(st));
|
||||||
|
|
||||||
std::vector<PointFloat> r;
|
std::vector<int> results;
|
||||||
r.reserve(total / 10);
|
results.reserve(num_points / 2);
|
||||||
for (size_t i = 0; i < total; i++)
|
for (int i = 0; i < num_points; ++i)
|
||||||
if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z);
|
if (res.h_pass_mask[0][i]) results.push_back(i);
|
||||||
|
return results;
|
||||||
cudaFree(d_p);
|
|
||||||
cudaFree(d_c);
|
|
||||||
cudaFree(d_cen);
|
|
||||||
return r;
|
|
||||||
#else
|
#else
|
||||||
return {};
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// 离群点去除
|
* 体素化接口(独立)
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
|
|
||||||
|
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
|
||||||
{
|
{
|
||||||
return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t);
|
#ifdef USE_GPU
|
||||||
|
auto& res = GpuResources::instance();
|
||||||
|
if (!res.isInitialized() || !isDeviceAvailable()) return {};
|
||||||
|
|
||||||
|
int num_points = (int)(points.size() / 3);
|
||||||
|
if (num_points == 0 || num_points > res.max_points) return {};
|
||||||
|
|
||||||
|
int grid_dim = static_cast<int>(range / resolution);
|
||||||
|
if (grid_dim > res.max_voxel_dim) grid_dim = res.max_voxel_dim;
|
||||||
|
int num_voxels = grid_dim * grid_dim * grid_dim;
|
||||||
|
if (num_voxels > res.max_voxels) return {};
|
||||||
|
|
||||||
|
cudaStream_t st = res.stream[0];
|
||||||
|
|
||||||
|
/* 全部点视为通过空间过滤,pass_mask 全置 1 */
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_pass_mask[0], 1, (size_t)num_points * sizeof(uint8_t), st));
|
||||||
|
|
||||||
|
std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
|
||||||
|
cudaMemcpyHostToDevice, st));
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_counts[0], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_x[0], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_y[0], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
CUDA_CHECK(cudaMemsetAsync(res.d_voxel_sum_z[0], 0, (size_t)num_voxels * sizeof(int), st));
|
||||||
|
|
||||||
|
int blocks_pts = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
kernelVoxelize<<<blocks_pts, BLOCK_SIZE, 0, st>>>(res.d_points[0], res.d_pass_mask[0], num_points, resolution,
|
||||||
|
range, grid_dim, res.d_voxel_counts[0], res.d_voxel_sum_x[0],
|
||||||
|
res.d_voxel_sum_y[0], res.d_voxel_sum_z[0]);
|
||||||
|
|
||||||
|
/* 回传体素结果 */
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_counts[0], res.d_voxel_counts[0], (size_t)num_voxels * sizeof(int),
|
||||||
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_x[0], res.d_voxel_sum_x[0], (size_t)num_voxels * sizeof(int),
|
||||||
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_y[0], res.d_voxel_sum_y[0], (size_t)num_voxels * sizeof(int),
|
||||||
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.h_voxel_sum_z[0], res.d_voxel_sum_z[0], (size_t)num_voxels * sizeof(int),
|
||||||
|
cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaStreamSynchronize(st));
|
||||||
|
|
||||||
|
float scale_inv = 1.0f / (float)VOXEL_SCALE;
|
||||||
|
std::vector<PointFloat> result;
|
||||||
|
result.reserve(num_voxels / 10);
|
||||||
|
for (int i = 0; i < num_voxels; ++i)
|
||||||
|
{
|
||||||
|
int cnt = res.h_voxel_counts[0][i];
|
||||||
|
if (cnt == 0) continue;
|
||||||
|
float inv_cnt = 1.0f / (float)cnt;
|
||||||
|
result.emplace_back((float)res.h_voxel_sum_x[0][i] * scale_inv * inv_cnt,
|
||||||
|
(float)res.h_voxel_sum_y[0][i] * scale_inv * inv_cnt,
|
||||||
|
(float)res.h_voxel_sum_z[0][i] * scale_inv * inv_cnt);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
#else
|
||||||
|
return {};
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
/* ================================================================
|
||||||
// PCL 格式转换
|
* SOR 接口(独立)
|
||||||
// =================================================================
|
* ================================================================ */
|
||||||
|
|
||||||
|
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
auto& res = GpuResources::instance();
|
||||||
|
if (!res.isInitialized() || !isDeviceAvailable()) return {};
|
||||||
|
|
||||||
|
int num_points = (int)(points.size() / 3);
|
||||||
|
if (num_points == 0 || num_points > res.max_points) return {};
|
||||||
|
|
||||||
|
cudaStream_t st = res.stream[0];
|
||||||
|
|
||||||
|
std::memcpy(res.h_points[0], points.data(), (size_t)num_points * 3 * sizeof(float));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(res.d_points[0], res.h_points[0], (size_t)num_points * 3 * sizeof(float),
|
||||||
|
cudaMemcpyHostToDevice, st));
|
||||||
|
|
||||||
|
float* d_mkdist = res.d_mean_dist[0];
|
||||||
|
int blocks = (num_points + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
kernelComputeMeanKDist<<<blocks, BLOCK_SIZE, 0, st>>>(res.d_points[0], num_points, mean_k, d_mkdist);
|
||||||
|
|
||||||
|
float* h_mkdist = reinterpret_cast<float*>(res.h_voxel_sum_x[0]);
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(h_mkdist, d_mkdist, (size_t)num_points * sizeof(float), cudaMemcpyDeviceToHost, st));
|
||||||
|
CUDA_CHECK(cudaStreamSynchronize(st));
|
||||||
|
|
||||||
|
return cpuSORFilter(h_mkdist, num_points, std_thresh);
|
||||||
|
#else
|
||||||
|
return {};
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
* PCL ↔ float 数组转换辅助函数
|
||||||
|
* ================================================================ */
|
||||||
|
|
||||||
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
|
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
|
||||||
{
|
{
|
||||||
std::vector<float> r;
|
std::vector<float> result;
|
||||||
if (!cloud) return r;
|
if (!cloud || cloud->empty()) return result;
|
||||||
r.reserve(cloud->size() * 3);
|
|
||||||
for (auto& p : cloud->points)
|
result.resize(cloud->size() * 3);
|
||||||
|
for (size_t i = 0; i < cloud->size(); ++i)
|
||||||
{
|
{
|
||||||
r.push_back(p.x);
|
result[i * 3 + 0] = cloud->points[i].x;
|
||||||
r.push_back(p.y);
|
result[i * 3 + 1] = cloud->points[i].y;
|
||||||
r.push_back(p.z);
|
result[i * 3 + 2] = cloud->points[i].z;
|
||||||
}
|
}
|
||||||
return r;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
|
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
|
||||||
{
|
{
|
||||||
auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>);
|
auto cloud = std::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
|
||||||
size_t n = points.size() / 3;
|
size_t n = points.size() / 3;
|
||||||
cloud->resize(n);
|
cloud->resize(n);
|
||||||
for (size_t i = 0; i < n; i++)
|
for (size_t i = 0; i < n; ++i)
|
||||||
{
|
{
|
||||||
cloud->points[i].x = points[i * 3 + 0];
|
cloud->points[i].x = points[i * 3 + 0];
|
||||||
cloud->points[i].y = points[i * 3 + 1];
|
cloud->points[i].y = points[i * 3 + 1];
|
||||||
cloud->points[i].z = points[i * 3 + 2];
|
cloud->points[i].z = points[i * 3 + 2];
|
||||||
}
|
}
|
||||||
cloud->width = n;
|
cloud->width = (uint32_t)n;
|
||||||
cloud->height = 1;
|
cloud->height = 1;
|
||||||
cloud->is_dense = true;
|
cloud->is_dense = true;
|
||||||
return cloud;
|
return cloud;
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user