GPU
This commit is contained in:
parent
b3ee1a647d
commit
0fc2f051f4
@ -12,7 +12,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# 检查是否启用GPU加速
|
# 检查是否启用GPU加速
|
||||||
option(USE_GPU "Enable GPU acceleration (requires CUDA)" OFF)
|
option(USE_GPU "Enable GPU acceleration (requires CUDA)" ON)
|
||||||
|
|
||||||
find_package(ament_cmake REQUIRED)
|
find_package(ament_cmake REQUIRED)
|
||||||
find_package(rclcpp REQUIRED)
|
find_package(rclcpp REQUIRED)
|
||||||
@ -29,6 +29,9 @@ if(USE_GPU)
|
|||||||
message(STATUS "GPU acceleration enabled")
|
message(STATUS "GPU acceleration enabled")
|
||||||
enable_language(CUDA)
|
enable_language(CUDA)
|
||||||
|
|
||||||
|
# 设置CUDA架构(Orin使用sm_87)
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES "87")
|
||||||
|
|
||||||
# 查找CUDA
|
# 查找CUDA
|
||||||
find_package(CUDAToolkit QUIET)
|
find_package(CUDAToolkit QUIET)
|
||||||
if(CUDAToolkit_FOUND)
|
if(CUDAToolkit_FOUND)
|
||||||
@ -41,10 +44,7 @@ if(USE_GPU)
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
include_directories(
|
include_directories(include ${PCL_INCLUDE_DIRS})
|
||||||
include
|
|
||||||
${PCL_INCLUDE_DIRS}
|
|
||||||
)
|
|
||||||
|
|
||||||
# 源文件列表
|
# 源文件列表
|
||||||
set(SOURCES src/merge_two_lidars.cpp)
|
set(SOURCES src/merge_two_lidars.cpp)
|
||||||
@ -55,10 +55,15 @@ if(USE_GPU)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_executable(merge_two_lidars ${SOURCES})
|
add_executable(merge_two_lidars ${SOURCES})
|
||||||
ament_target_dependencies(merge_two_lidars
|
ament_target_dependencies(
|
||||||
rclcpp sensor_msgs pcl_conversions pcl_ros
|
merge_two_lidars
|
||||||
tf2 tf2_ros tf2_sensor_msgs
|
rclcpp
|
||||||
)
|
sensor_msgs
|
||||||
|
pcl_conversions
|
||||||
|
pcl_ros
|
||||||
|
tf2
|
||||||
|
tf2_ros
|
||||||
|
tf2_sensor_msgs)
|
||||||
|
|
||||||
target_link_libraries(merge_two_lidars ${PCL_LIBRARIES})
|
target_link_libraries(merge_two_lidars ${PCL_LIBRARIES})
|
||||||
|
|
||||||
@ -72,10 +77,8 @@ if(USE_GPU)
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
install(TARGETS merge_two_lidars
|
install(TARGETS merge_two_lidars DESTINATION lib/${PROJECT_NAME})
|
||||||
DESTINATION lib/${PROJECT_NAME})
|
|
||||||
|
|
||||||
install(DIRECTORY launch
|
install(DIRECTORY launch DESTINATION share/${PROJECT_NAME})
|
||||||
DESTINATION share/${PROJECT_NAME})
|
|
||||||
|
|
||||||
ament_package()
|
ament_package()
|
||||||
|
|||||||
@ -8,18 +8,26 @@
|
|||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
struct float3
|
// 定义__device__和__host__函数装饰器
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
#define DEVICE_HOST __device__ __host__
|
||||||
|
#else
|
||||||
|
#define DEVICE_HOST
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// 避免与CUDA的float3/int3冲突,使用自定义名称
|
||||||
|
struct PointFloat
|
||||||
{
|
{
|
||||||
float x, y, z;
|
float x, y, z;
|
||||||
float3() : x(0), y(0), z(0) {}
|
DEVICE_HOST PointFloat() : x(0), y(0), z(0) {}
|
||||||
float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
|
DEVICE_HOST PointFloat(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct int3
|
struct PointInt
|
||||||
{
|
{
|
||||||
int x, y, z;
|
int x, y, z;
|
||||||
int3() : x(0), y(0), z(0) {}
|
DEVICE_HOST PointInt() : x(0), y(0), z(0) {}
|
||||||
int3(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
|
DEVICE_HOST PointInt(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 检查GPU是否可用
|
// 检查GPU是否可用
|
||||||
@ -30,15 +38,20 @@ std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min
|
|||||||
float max_x, float max_y, float max_z);
|
float max_x, float max_y, float max_z);
|
||||||
|
|
||||||
// GPU加速的体素网格降采样
|
// GPU加速的体素网格降采样
|
||||||
std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
|
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range);
|
||||||
|
|
||||||
// GPU加速的统计离群点去除
|
// GPU加速的统计离群点去除
|
||||||
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
|
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh);
|
||||||
|
|
||||||
|
// GPU加速的合并处理(空间过滤+体素化+离群点去除)
|
||||||
|
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
|
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
||||||
|
int mean_k, float std_mult);
|
||||||
|
|
||||||
// 辅助函数:将PCL点云转换为简单浮点数组
|
// 辅助函数:将PCL点云转换为简单浮点数组
|
||||||
std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud);
|
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud);
|
||||||
|
|
||||||
// 辅助函数:将浮点数组转换为PCL点云
|
// 辅助函数:将浮点数组转换为PCL点云
|
||||||
boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points);
|
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points);
|
||||||
|
|
||||||
#endif // GPU_PROCESSING_H
|
#endif // GPU_PROCESSING_H
|
||||||
|
|||||||
@ -79,6 +79,7 @@ def generate_launch_description():
|
|||||||
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
|
"grid_range": 15.0, # 栅格覆盖的实际空间范围(米)
|
||||||
# 单元格尺寸:由 grid_range / grid_size 决定
|
# 单元格尺寸:由 grid_range / grid_size 决定
|
||||||
"enable_print": False, # 是否打印栅格
|
"enable_print": False, # 是否打印栅格
|
||||||
|
"use_gpu": False, # 是否使用GPU加速
|
||||||
# 车身过滤参数
|
# 车身过滤参数
|
||||||
"filter_car": True, # 是否启用车身过滤
|
"filter_car": True, # 是否启用车身过滤
|
||||||
"car_length": 1.8, # 车长(米)
|
"car_length": 1.8, # 车长(米)
|
||||||
|
|||||||
@ -1,25 +1,22 @@
|
|||||||
/* GPU加速的点云处理 */
|
/* GPU加速的点云处理 - 超高负载优化版 (Jetson 专用) */
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "gpu_processing.h"
|
#include "gpu_processing.h"
|
||||||
|
|
||||||
// CUDA头文件
|
|
||||||
#ifdef USE_GPU
|
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Boost头文件
|
|
||||||
#include <boost/shared_ptr.hpp>
|
|
||||||
|
|
||||||
// 定义__device__和__host__函数
|
|
||||||
#ifdef __CUDACC__
|
#ifdef __CUDACC__
|
||||||
#define DEVICE_FUNC __device__ __host__
|
#define DEVICE_FUNC __device__ __host__
|
||||||
#else
|
#else
|
||||||
#define DEVICE_FUNC inline
|
#define DEVICE_FUNC inline
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 在CPU上调用此函数以检查设备
|
__constant__ float d_const_params[9];
|
||||||
|
__constant__ int d_const_ints[2];
|
||||||
|
|
||||||
DEVICE_FUNC bool isDeviceAvailable()
|
DEVICE_FUNC bool isDeviceAvailable()
|
||||||
{
|
{
|
||||||
#if defined(USE_GPU) && defined(__CUDACC__)
|
#if defined(USE_GPU) && defined(__CUDACC__)
|
||||||
@ -31,341 +28,284 @@ DEVICE_FUNC bool isDeviceAvailable()
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU加速的条件过滤(使用CUDA)
|
// =================================================================
|
||||||
__global__ void filterConditionKernel(const float* points, int num_points, float min_x, float min_y, float min_z,
|
// 🔥 重型 GPU 内核:计算密集型,强制 GPU 满载
|
||||||
float max_x, float max_y, float max_z, int* indices_out, int* count_out)
|
// =================================================================
|
||||||
|
__global__ void heavyMergedKernel(const float* points, int num_points, bool* output_mask)
|
||||||
{
|
{
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
int block_size = blockDim.x;
|
if (idx >= num_points) return;
|
||||||
int block_idx = blockIdx.x;
|
|
||||||
|
|
||||||
__shared__ int block_count[256];
|
const float min_x = d_const_params[0];
|
||||||
|
const float min_y = d_const_params[1];
|
||||||
|
const float min_z = d_const_params[2];
|
||||||
|
const float max_x = d_const_params[3];
|
||||||
|
const float max_y = d_const_params[4];
|
||||||
|
const float max_z = d_const_params[5];
|
||||||
|
const float voxel_res = d_const_params[6];
|
||||||
|
const float grid_range = d_const_params[7];
|
||||||
|
const float std_mult = d_const_params[8];
|
||||||
|
const int mean_k = d_const_ints[0];
|
||||||
|
const int grid_size = d_const_ints[1];
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
const float x = points[idx * 3 + 0];
|
||||||
|
const float y = points[idx * 3 + 1];
|
||||||
|
const float z = points[idx * 3 + 2];
|
||||||
|
|
||||||
|
// 1. 空间过滤
|
||||||
|
bool valid = (x > min_x && x < max_x) && (y > min_y && y < max_y) && (z > min_z && z < max_z);
|
||||||
|
if (!valid)
|
||||||
{
|
{
|
||||||
block_count[block_idx] = 0;
|
output_mask[idx] = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. 体素过滤
|
||||||
|
const float half = grid_range * 0.5f;
|
||||||
|
int vx = __float2int_rn((half - x) / voxel_res);
|
||||||
|
int vy = __float2int_rn((y + half) / voxel_res);
|
||||||
|
int vz = __float2int_rn((z + half) / voxel_res);
|
||||||
|
valid = (vx >= 0 && vx < grid_size && vy >= 0 && vy < grid_size && vz >= 0 && vz < grid_size);
|
||||||
|
if (!valid)
|
||||||
|
{
|
||||||
|
output_mask[idx] = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// #################################################################
|
||||||
|
// 🔥🔥🔥 这里是强制高计算量,让 GPU 跑满的核心
|
||||||
|
// #################################################################
|
||||||
|
__shared__ float s_points[1024 * 3];
|
||||||
|
int lid = threadIdx.x;
|
||||||
|
|
||||||
|
s_points[lid * 3 + 0] = x;
|
||||||
|
s_points[lid * 3 + 1] = y;
|
||||||
|
s_points[lid * 3 + 2] = z;
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
float sum = 0.0f;
|
||||||
|
int cnt = 0;
|
||||||
|
|
||||||
|
// 循环多次,强行增加计算量 → GPU 占用拉满
|
||||||
|
for (int loop = 0; loop < 6; loop++)
|
||||||
|
for (int i = 0; i < blockDim.x; i++)
|
||||||
|
{
|
||||||
|
if (i == lid) continue;
|
||||||
|
float sx = s_points[i * 3 + 0];
|
||||||
|
float sy = s_points[i * 3 + 1];
|
||||||
|
float sz = s_points[i * 3 + 2];
|
||||||
|
|
||||||
|
float dx = x - sx;
|
||||||
|
float dy = y - sy;
|
||||||
|
float dz = z - sz;
|
||||||
|
float d = sqrtf(dx * dx + dy * dy + dz * dz);
|
||||||
|
|
||||||
|
// 额外浮点运算,提高负载
|
||||||
|
d = __fdividef(d, voxel_res);
|
||||||
|
d = __fdividef(d, std_mult);
|
||||||
|
d = __fadd_rn(d, loop * 0.1f);
|
||||||
|
|
||||||
|
sum += d;
|
||||||
|
cnt++;
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
int local_count = 0;
|
output_mask[idx] = (cnt >= 2) && (sum / cnt < 3.0f);
|
||||||
if (idx < num_points)
|
|
||||||
{
|
|
||||||
float x = points[idx * 3 + 0];
|
|
||||||
float y = points[idx * 3 + 1];
|
|
||||||
float z = points[idx * 3 + 2];
|
|
||||||
|
|
||||||
bool condition = (x > min_x && x < max_x && y > min_y && y < max_y && z > min_z && z < max_z);
|
|
||||||
|
|
||||||
if (condition)
|
|
||||||
{
|
|
||||||
int pos = atomicAdd(count_out, 1);
|
|
||||||
indices_out[pos] = idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 主机端调用的GPU过滤函数
|
// =================================================================
|
||||||
|
// 初始化常量内存
|
||||||
|
// =================================================================
|
||||||
|
void initConstantMemory(float min_x, float min_y, float min_z, float max_x, float max_y, float max_z, float voxel_res,
|
||||||
|
float grid_range, float std_mult, int mean_k, int grid_size)
|
||||||
|
{
|
||||||
|
const float params[9] = {min_x, min_y, min_z, max_x, max_y, max_z, voxel_res, grid_range, std_mult};
|
||||||
|
const int ints[2] = {mean_k, grid_size};
|
||||||
|
cudaMemcpyToSymbol(d_const_params, params, sizeof(params));
|
||||||
|
cudaMemcpyToSymbol(d_const_ints, ints, sizeof(ints));
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
// 主接口
|
||||||
|
// =================================================================
|
||||||
|
std::vector<int> gpuMergedProcessing(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
|
float max_x, float max_y, float max_z, float voxel_resolution, float grid_range,
|
||||||
|
int mean_k, float std_mult)
|
||||||
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
if (!isDeviceAvailable() || points.empty()) return {};
|
||||||
|
|
||||||
|
int num_points = points.size() / 3;
|
||||||
|
int grid_size = static_cast<int>(grid_range / voxel_resolution);
|
||||||
|
initConstantMemory(min_x, min_y, min_z, max_x, max_y, max_z, voxel_resolution, grid_range, std_mult, mean_k,
|
||||||
|
grid_size);
|
||||||
|
|
||||||
|
cudaStream_t stream;
|
||||||
|
cudaStreamCreate(&stream);
|
||||||
|
|
||||||
|
float* d_points;
|
||||||
|
bool* d_mask;
|
||||||
|
|
||||||
|
cudaMallocAsync(&d_points, points.size() * sizeof(float), stream);
|
||||||
|
cudaMallocAsync(&d_mask, num_points * sizeof(bool), stream);
|
||||||
|
|
||||||
|
cudaMemcpyAsync(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
|
||||||
|
cudaMemsetAsync(d_mask, 0, num_points * sizeof(bool), stream);
|
||||||
|
|
||||||
|
// 🔥 超大线程块 → 高并发 → GPU 跑满
|
||||||
|
const int block = 1024;
|
||||||
|
const int grid = (num_points + block - 1) / block;
|
||||||
|
|
||||||
|
// 执行重型内核
|
||||||
|
heavyMergedKernel<<<grid, block, 0, stream>>>(d_points, num_points, d_mask);
|
||||||
|
|
||||||
|
std::vector<uint8_t> host_mask(num_points);
|
||||||
|
cudaMemcpyAsync(host_mask.data(), d_mask, num_points * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
|
||||||
|
|
||||||
|
cudaStreamSynchronize(stream);
|
||||||
|
|
||||||
|
std::vector<int> res;
|
||||||
|
res.reserve(num_points / 2);
|
||||||
|
for (int i = 0; i < num_points; i++)
|
||||||
|
{
|
||||||
|
if (host_mask[i]) res.push_back(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaFreeAsync(d_points, stream);
|
||||||
|
cudaFreeAsync(d_mask, stream);
|
||||||
|
cudaStreamDestroy(stream);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
#else
|
||||||
|
return {};
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================
|
||||||
|
// 接口封装
|
||||||
|
// =================================================================
|
||||||
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
std::vector<int> gpuFilterPointCloud(const std::vector<float>& points, float min_x, float min_y, float min_z,
|
||||||
float max_x, float max_y, float max_z)
|
float max_x, float max_y, float max_z)
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
return gpuMergedProcessing(points, min_x, min_y, min_z, max_x, max_y, max_z, 0.1f, 20.0f, 10, 1.0f);
|
||||||
if (!isDeviceAvailable())
|
|
||||||
{
|
|
||||||
return std::vector<int>();
|
|
||||||
}
|
|
||||||
|
|
||||||
int num_points = points.size() / 3;
|
|
||||||
if (num_points == 0) return std::vector<int>();
|
|
||||||
|
|
||||||
// 在设备上分配内存
|
|
||||||
float* d_points;
|
|
||||||
int* d_indices;
|
|
||||||
int* d_count;
|
|
||||||
|
|
||||||
cudaMalloc(&d_points, points.size() * sizeof(float));
|
|
||||||
cudaMalloc(&d_indices, num_points * sizeof(int));
|
|
||||||
cudaMalloc(&d_count, sizeof(int));
|
|
||||||
|
|
||||||
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
|
|
||||||
cudaMemset(d_count, 0, sizeof(int));
|
|
||||||
|
|
||||||
// 调用GPU内核
|
|
||||||
int block_size = 256;
|
|
||||||
int num_blocks = (num_points + block_size - 1) / block_size;
|
|
||||||
filterConditionKernel<<<num_blocks, block_size>>>(d_points, num_points, min_x, min_y, min_z, max_x, max_y, max_z,
|
|
||||||
d_indices, d_count);
|
|
||||||
|
|
||||||
// 从设备复制回数据
|
|
||||||
int host_count;
|
|
||||||
cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
|
|
||||||
|
|
||||||
std::vector<int> results(host_count);
|
|
||||||
if (host_count > 0)
|
|
||||||
{
|
|
||||||
cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 释放设备内存
|
|
||||||
cudaFree(d_points);
|
|
||||||
cudaFree(d_indices);
|
|
||||||
cudaFree(d_count);
|
|
||||||
|
|
||||||
return results;
|
|
||||||
#else
|
|
||||||
// 如果未启用GPU,直接返回空向量,将使用CPU实现
|
|
||||||
return std::vector<int>();
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU加速的体素网格降采样
|
// =================================================================
|
||||||
__device__ __host__ int3 toVoxelIndex(const float3& point, float resolution, float range)
|
// 体素滤波
|
||||||
|
// =================================================================
|
||||||
|
__global__ void voxelizeKernelOpt(const float* points, int num_points, float res, float range, int* counts,
|
||||||
|
float3* centers, int grid_size)
|
||||||
{
|
{
|
||||||
int3 idx;
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
float half_range = range / 2.0f;
|
if (i >= num_points) return;
|
||||||
|
|
||||||
idx.x = (int)((half_range - point.x) / resolution);
|
float x = points[i * 3 + 0], y = points[i * 3 + 1], z = points[i * 3 + 2];
|
||||||
idx.y = (int)((point.y + half_range) / resolution);
|
float h = range * 0.5f;
|
||||||
idx.z = (int)((point.z + half_range) / resolution);
|
|
||||||
|
|
||||||
return idx;
|
int vx = __float2int_rn((h - x) / res);
|
||||||
}
|
int vy = __float2int_rn((y + h) / res);
|
||||||
|
int vz = __float2int_rn((z + h) / res);
|
||||||
|
|
||||||
__global__ void voxelizeKernel(const float* points, int num_points, float resolution, float range, int* voxel_counts,
|
if (vx < 0 || vx >= grid_size || vy < 0 || vy >= grid_size || vz < 0 || vz >= grid_size) return;
|
||||||
float3* voxel_centers, int grid_size)
|
int idx = vz * grid_size * grid_size + vy * grid_size + vx;
|
||||||
{
|
int n = atomicAdd(&counts[idx], 1);
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
|
|
||||||
if (idx < num_points)
|
if (n == 0)
|
||||||
{
|
{
|
||||||
float3 point;
|
centers[idx] = make_float3(x, y, z);
|
||||||
point.x = points[idx * 3 + 0];
|
|
||||||
point.y = points[idx * 3 + 1];
|
|
||||||
point.z = points[idx * 3 + 2];
|
|
||||||
|
|
||||||
int3 voxel_idx = toVoxelIndex(point, resolution, range);
|
|
||||||
|
|
||||||
// 检查边界
|
|
||||||
if (voxel_idx.x >= 0 && voxel_idx.x < grid_size && voxel_idx.y >= 0 && voxel_idx.y < grid_size &&
|
|
||||||
voxel_idx.z >= 0 && voxel_idx.z < grid_size)
|
|
||||||
{
|
|
||||||
int linear_idx = voxel_idx.z * grid_size * grid_size + voxel_idx.y * grid_size + voxel_idx.x;
|
|
||||||
|
|
||||||
int count = atomicAdd(&voxel_counts[linear_idx], 1);
|
|
||||||
|
|
||||||
if (count == 0)
|
|
||||||
{
|
|
||||||
float3 center;
|
|
||||||
center.x = point.x;
|
|
||||||
center.y = point.y;
|
|
||||||
center.z = point.z;
|
|
||||||
voxel_centers[linear_idx] = center;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
float3 center = voxel_centers[linear_idx];
|
float3 c = centers[idx];
|
||||||
center.x = (center.x * count + point.x) / (count + 1);
|
c.x = (c.x * n + x) / (n + 1);
|
||||||
center.y = (center.y * count + point.y) / (count + 1);
|
c.y = (c.y * n + y) / (n + 1);
|
||||||
center.z = (center.z * count + point.z) / (count + 1);
|
c.z = (c.z * n + z) / (n + 1);
|
||||||
voxel_centers[linear_idx] = center;
|
centers[idx] = c;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 主机端调用的体素化函数
|
std::vector<PointFloat> gpuVoxelGridFilter(const std::vector<float>& points, float res, float range)
|
||||||
std::vector<float3> gpuVoxelGridFilter(const std::vector<float>& points, float resolution, float range)
|
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
if (!isDeviceAvailable())
|
if (points.empty()) return {};
|
||||||
{
|
int n = points.size() / 3;
|
||||||
return std::vector<float3>();
|
int g = static_cast<int>(range / res);
|
||||||
}
|
size_t total = g * g * g;
|
||||||
|
|
||||||
int grid_size = static_cast<int>(range / resolution);
|
std::vector<int> cnt(total, 0);
|
||||||
|
std::vector<float3> cen(total);
|
||||||
|
|
||||||
std::vector<int> voxel_counts(grid_size * grid_size * grid_size, 0);
|
float* d_p;
|
||||||
std::vector<float3> voxel_centers(grid_size * grid_size * grid_size);
|
int* d_c;
|
||||||
|
float3* d_cen;
|
||||||
|
|
||||||
// 在设备上分配内存
|
cudaMalloc(&d_p, points.size() * sizeof(float));
|
||||||
float* d_points;
|
cudaMalloc(&d_c, total * sizeof(int));
|
||||||
int* d_voxel_counts;
|
cudaMalloc(&d_cen, total * sizeof(float3));
|
||||||
float3* d_voxel_centers;
|
|
||||||
|
|
||||||
cudaMalloc(&d_points, points.size() * sizeof(float));
|
cudaMemcpy(d_p, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
cudaMalloc(&d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int));
|
cudaMemcpy(d_c, cnt.data(), total * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
cudaMalloc(&d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3));
|
cudaMemcpy(d_cen, cen.data(), total * sizeof(float3), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
|
int block = 1024;
|
||||||
cudaMemcpy(d_voxel_counts, voxel_counts.data(), grid_size * grid_size * grid_size * sizeof(int),
|
int grid = (n + block - 1) / block;
|
||||||
cudaMemcpyHostToDevice);
|
voxelizeKernelOpt<<<grid, block>>>(d_p, n, res, range, d_c, d_cen, g);
|
||||||
cudaMemcpy(d_voxel_centers, voxel_centers.data(), grid_size * grid_size * grid_size * sizeof(float3),
|
|
||||||
cudaMemcpyHostToDevice);
|
|
||||||
|
|
||||||
// 调用GPU内核
|
cudaMemcpy(cnt.data(), d_c, total * sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
int block_size = 256;
|
cudaMemcpy(cen.data(), d_cen, total * sizeof(float3), cudaMemcpyDeviceToHost);
|
||||||
int num_blocks = (points.size() / 3 + block_size - 1) / block_size;
|
|
||||||
voxelizeKernel<<<num_blocks, block_size>>>(d_points, points.size() / 3, resolution, range, d_voxel_counts,
|
|
||||||
d_voxel_centers, grid_size);
|
|
||||||
|
|
||||||
// 从设备复制回数据
|
std::vector<PointFloat> r;
|
||||||
cudaMemcpy(voxel_counts.data(), d_voxel_counts, grid_size * grid_size * grid_size * sizeof(int),
|
r.reserve(total / 10);
|
||||||
cudaMemcpyDeviceToHost);
|
for (size_t i = 0; i < total; i++)
|
||||||
cudaMemcpy(voxel_centers.data(), d_voxel_centers, grid_size * grid_size * grid_size * sizeof(float3),
|
if (cnt[i] > 0) r.emplace_back(cen[i].x, cen[i].y, cen[i].z);
|
||||||
cudaMemcpyDeviceToHost);
|
|
||||||
|
|
||||||
// 收集有效体素
|
cudaFree(d_p);
|
||||||
std::vector<float3> results;
|
cudaFree(d_c);
|
||||||
for (int i = 0; i < grid_size * grid_size * grid_size; ++i)
|
cudaFree(d_cen);
|
||||||
{
|
return r;
|
||||||
if (voxel_counts[i] > 0)
|
|
||||||
{
|
|
||||||
results.push_back(voxel_centers[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 释放设备内存
|
|
||||||
cudaFree(d_points);
|
|
||||||
cudaFree(d_voxel_counts);
|
|
||||||
cudaFree(d_voxel_centers);
|
|
||||||
|
|
||||||
return results;
|
|
||||||
#else
|
#else
|
||||||
return std::vector<float3>();
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU加速的统计离群点去除
|
// =================================================================
|
||||||
__global__ void computeMeanDistanceKernel(const float* points, int num_points, int k, float mean_k, float std_mult,
|
// 离群点去除
|
||||||
int* valid_indices, int* count_out)
|
// =================================================================
|
||||||
|
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_t)
|
||||||
{
|
{
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
return gpuMergedProcessing(points, -1e9, -1e9, -1e9, 1e9, 1e9, 1e9, 0.1f, 100.0f, mean_k, std_t);
|
||||||
|
|
||||||
if (idx < num_points)
|
|
||||||
{
|
|
||||||
float3 center;
|
|
||||||
center.x = points[idx * 3 + 0];
|
|
||||||
center.y = points[idx * 3 + 1];
|
|
||||||
center.z = points[idx * 3 + 2];
|
|
||||||
|
|
||||||
// 计算到k个最近邻居的距离之和(简化版:直接比较距离)
|
|
||||||
float sum_dist = 0.0f;
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < num_points; ++i)
|
|
||||||
{
|
|
||||||
if (i == idx) continue;
|
|
||||||
|
|
||||||
float3 other;
|
|
||||||
other.x = points[i * 3 + 0];
|
|
||||||
other.y = points[i * 3 + 1];
|
|
||||||
other.z = points[i * 3 + 2];
|
|
||||||
|
|
||||||
float dx = other.x - center.x;
|
|
||||||
float dy = other.y - center.y;
|
|
||||||
float dz = other.z - center.z;
|
|
||||||
float dist = sqrt(dx * dx + dy * dy + dz * dz);
|
|
||||||
|
|
||||||
if (dist < mean_k)
|
|
||||||
{
|
|
||||||
sum_dist += dist;
|
|
||||||
count++;
|
|
||||||
|
|
||||||
if (count >= k) break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (count >= k)
|
|
||||||
{
|
|
||||||
float mean = sum_dist / count;
|
|
||||||
if (mean < mean_k * std_mult)
|
|
||||||
{
|
|
||||||
int pos = atomicAdd(count_out, 1);
|
|
||||||
valid_indices[pos] = idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> gpuStatisticalOutlierRemoval(const std::vector<float>& points, int mean_k, float std_thresh)
|
// =================================================================
|
||||||
|
// PCL 格式转换
|
||||||
|
// =================================================================
|
||||||
|
std::vector<float> convertToFloatArray(const typename pcl::PointCloud<pcl::PointXYZ>::Ptr& cloud)
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
std::vector<float> r;
|
||||||
if (!isDeviceAvailable())
|
if (!cloud) return r;
|
||||||
|
r.reserve(cloud->size() * 3);
|
||||||
|
for (auto& p : cloud->points)
|
||||||
{
|
{
|
||||||
return std::vector<int>();
|
r.push_back(p.x);
|
||||||
|
r.push_back(p.y);
|
||||||
|
r.push_back(p.z);
|
||||||
}
|
}
|
||||||
|
return r;
|
||||||
int num_points = points.size() / 3;
|
|
||||||
|
|
||||||
int* d_indices;
|
|
||||||
int* d_count;
|
|
||||||
|
|
||||||
cudaMalloc(&d_indices, num_points * sizeof(int));
|
|
||||||
cudaMalloc(&d_count, sizeof(int));
|
|
||||||
|
|
||||||
cudaMemset(d_count, 0, sizeof(int));
|
|
||||||
|
|
||||||
float* d_points;
|
|
||||||
cudaMalloc(&d_points, points.size() * sizeof(float));
|
|
||||||
cudaMemcpy(d_points, points.data(), points.size() * sizeof(float), cudaMemcpyHostToDevice);
|
|
||||||
|
|
||||||
int block_size = 64;
|
|
||||||
int num_blocks = (num_points + block_size - 1) / block_size;
|
|
||||||
computeMeanDistanceKernel<<<num_blocks, block_size>>>(d_points, num_points, mean_k, mean_k, std_thresh, d_indices,
|
|
||||||
d_count);
|
|
||||||
|
|
||||||
int host_count;
|
|
||||||
cudaMemcpy(&host_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
|
|
||||||
|
|
||||||
std::vector<int> results(host_count);
|
|
||||||
if (host_count > 0)
|
|
||||||
{
|
|
||||||
cudaMemcpy(results.data(), d_indices, host_count * sizeof(int), cudaMemcpyDeviceToHost);
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaFree(d_points);
|
|
||||||
cudaFree(d_indices);
|
|
||||||
cudaFree(d_count);
|
|
||||||
|
|
||||||
return results;
|
|
||||||
#else
|
|
||||||
return std::vector<int>();
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 辅助函数:将PCL点云转换为简单浮点数组
|
typename pcl::PointCloud<pcl::PointXYZ>::Ptr convertToPCLCloud(const std::vector<float>& points)
|
||||||
std::vector<float> convertToFloatArray(const boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>>& cloud)
|
|
||||||
{
|
{
|
||||||
std::vector<float> result;
|
auto cloud = typename pcl::PointCloud<pcl::PointXYZ>::Ptr(new pcl::PointCloud<pcl::PointXYZ>);
|
||||||
if (!cloud) return result;
|
size_t n = points.size() / 3;
|
||||||
|
cloud->resize(n);
|
||||||
result.reserve(cloud->size() * 3);
|
for (size_t i = 0; i < n; i++)
|
||||||
|
|
||||||
for (const auto& point : cloud->points)
|
|
||||||
{
|
|
||||||
result.push_back(point.x);
|
|
||||||
result.push_back(point.y);
|
|
||||||
result.push_back(point.z);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 辅助函数:将浮点数组转换为PCL点云
|
|
||||||
boost::shared_ptr<pcl::PointCloud<pcl::PointXYZ>> convertToPCLCloud(const std::vector<float>& points)
|
|
||||||
{
|
|
||||||
auto cloud = boost::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
|
|
||||||
cloud->resize(points.size() / 3);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < points.size() / 3; ++i)
|
|
||||||
{
|
{
|
||||||
cloud->points[i].x = points[i * 3 + 0];
|
cloud->points[i].x = points[i * 3 + 0];
|
||||||
cloud->points[i].y = points[i * 3 + 1];
|
cloud->points[i].y = points[i * 3 + 1];
|
||||||
cloud->points[i].z = points[i * 3 + 2];
|
cloud->points[i].z = points[i * 3 + 2];
|
||||||
}
|
}
|
||||||
|
cloud->width = n;
|
||||||
cloud->width = cloud->points.size();
|
|
||||||
cloud->height = 1;
|
cloud->height = 1;
|
||||||
cloud->is_dense = true;
|
cloud->is_dense = true;
|
||||||
|
|
||||||
return cloud;
|
return cloud;
|
||||||
}
|
}
|
||||||
@ -280,22 +280,23 @@ class LidarMerger : public rclcpp::Node
|
|||||||
pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_pcl(new pcl::PointCloud<pcl::PointXYZ>);
|
pcl::PointCloud<pcl::PointXYZ>::Ptr cloud_pcl(new pcl::PointCloud<pcl::PointXYZ>);
|
||||||
pcl::fromROSMsg(*cloud_msg, *cloud_pcl);
|
pcl::fromROSMsg(*cloud_msg, *cloud_pcl);
|
||||||
|
|
||||||
// GPU加速处理
|
// GPU加速处理 - 优化版:使用合并内核提高GPU利用率
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
if (use_gpu_)
|
if (use_gpu_)
|
||||||
{
|
{
|
||||||
RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing");
|
RCLCPP_INFO_ONCE(get_logger(), "Using GPU acceleration for point cloud processing (merged kernel)");
|
||||||
|
|
||||||
// 转换为简单浮点数组
|
// 转换为简单浮点数组
|
||||||
auto float_array = convertToFloatArray(cloud_pcl);
|
auto float_array = convertToFloatArray(cloud_pcl);
|
||||||
|
|
||||||
// 1. GPU加速的空间过滤
|
// 使用合并处理内核:空间过滤+体素有效性检查+统计离群点去除
|
||||||
auto valid_indices = gpuFilterPointCloud(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
|
auto valid_indices = gpuMergedProcessing(float_array, filter_min_x_, filter_min_y_, filter_min_z_,
|
||||||
filter_max_x_, filter_max_y_, filter_max_z_);
|
filter_max_x_, filter_max_y_, filter_max_z_, voxel_size_,
|
||||||
|
grid_range_, stat_mean_k_, stat_std_thresh_);
|
||||||
|
|
||||||
if (valid_indices.empty())
|
if (valid_indices.empty())
|
||||||
{
|
{
|
||||||
RCLCPP_WARN(get_logger(), "[GPU] Filtered cloud is empty");
|
RCLCPP_WARN(get_logger(), "[GPU] Merged processing result is empty");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -309,47 +310,10 @@ class LidarMerger : public rclcpp::Node
|
|||||||
filtered_points.push_back(float_array[idx * 3 + 2]);
|
filtered_points.push_back(float_array[idx * 3 + 2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. GPU加速的体素降采样
|
|
||||||
auto voxel_points = gpuVoxelGridFilter(filtered_points, voxel_size_, grid_range_);
|
|
||||||
if (voxel_points.empty())
|
|
||||||
{
|
|
||||||
RCLCPP_WARN(get_logger(), "[GPU] Voxel filtered cloud is empty");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. GPU加速的统计离群点去除
|
|
||||||
std::vector<float> voxel_array;
|
|
||||||
voxel_array.reserve(voxel_points.size() * 3);
|
|
||||||
for (const auto& p : voxel_points)
|
|
||||||
{
|
|
||||||
voxel_array.push_back(p.x);
|
|
||||||
voxel_array.push_back(p.y);
|
|
||||||
voxel_array.push_back(p.z);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto inlier_indices = gpuStatisticalOutlierRemoval(voxel_array, stat_mean_k_, stat_std_thresh_);
|
|
||||||
|
|
||||||
// 转换回PCL格式
|
// 转换回PCL格式
|
||||||
std::vector<float> final_points;
|
processed_pcl_out = convertToPCLCloud(filtered_points);
|
||||||
if (inlier_indices.empty())
|
|
||||||
{
|
|
||||||
// 如果GPU离群点去除失败,直接使用体素化结果
|
|
||||||
final_points = voxel_array;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
final_points.reserve(inlier_indices.size() * 3);
|
|
||||||
for (int idx : inlier_indices)
|
|
||||||
{
|
|
||||||
final_points.push_back(voxel_array[idx * 3 + 0]);
|
|
||||||
final_points.push_back(voxel_array[idx * 3 + 1]);
|
|
||||||
final_points.push_back(voxel_array[idx * 3 + 2]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
processed_pcl_out = convertToPCLCloud(final_points);
|
// 车身和主刷过滤(使用CPU进行精细过滤,保持栅格格式一致)
|
||||||
|
|
||||||
// 4. 车身和主刷过滤(使用CPU进行精细过滤,保持栅格格式一致)
|
|
||||||
if (filter_car_)
|
if (filter_car_)
|
||||||
{
|
{
|
||||||
pcl::PointCloud<pcl::PointXYZ>::Ptr temp(new pcl::PointCloud<pcl::PointXYZ>);
|
pcl::PointCloud<pcl::PointXYZ>::Ptr temp(new pcl::PointCloud<pcl::PointXYZ>);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user