CUDA C++ Clock示例代码详细分析：如何分析每个块的时间效率
文章目录
一、逻辑链路分析
  1.1 程序整体架构
  1.2 数据流逻辑
  1.3 核心算法逻辑
  1.4 线程块与数据映射
二、逐行代码功能分析
  2.1 头文件和宏定义部分
  2.2 Kernel函数timedReduction
    参数列表
    共享内存声明
    线程索引获取
    开始计时
    数据加载到共享内存
    并行归约核心循环
    输出结果
    结束计时
  2.3 主函数main
    常量定义
    设备选择
    内存分配
    数据初始化与传输
    Kernel启动
    结果回传与统计分析
三、关键技术点总结
四、性能优化洞察

一、逻辑链路分析

// This example shows how to use the clock function to measure the performance of
// block of threads of a kernel accurately.
//
// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.

// System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>

// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();

    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    __syncthreads();

    if (tid == 0) timer[bid + gridDim.x] = clock();
}

#define NUM_BLOCKS 64
#define NUM_THREADS 256

// It's interesting to change the number of blocks and the number of threads to
// understand how to keep the hardware busy.
//
// Here are some numbers I get on my G80:
//    blocks - clocks
//    1 - 3096
//    8 - 3232
//    16 - 3364
//    32 - 4615
//    64 - 9981
//
// With less than 16 blocks some of the multiprocessors of the device are idle. With
// more than 16 you are using all the multiprocessors, but there's only one block per
// multiprocessor and that doesn't allow you to hide the latency of the memory.
// With more than 32 the speed scales linearly.

// Start the main CUDA Sample here
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    long double avgElapsedClocks = 0;

    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

    return EXIT_SUCCESS;
}

1.1 程序整体架构
该程序通过GPU并行归约计算，并利用clock()函数测量每个线程块执行归约操作所消耗的时钟周期数。

1.2 数据流逻辑
主机端数据初始化 → 拷贝到设备端 → GPU并行归约计算 + 计时 → 拷贝回主机端 → 统计分析

1.3 核心算法逻辑
归约算法：使用共享内存进行并行归约，每个线程块处理2*NUM_THREADS个数据元素。
计时机制：在线程块开始和结束时分别记录时钟值，计算差值得到执行时间。

1.4 线程块与数据映射
64个线程块，每个块256个线程。
每个线程块处理512个float数据（2*256）。
64个块合计执行64*512 = 32768次元素加载；但kernel以tid（而不是全局索引）访问input，且主机端只分配了512个float，因此所有线程块读取并归约的是同一组512个输入数据，而不是32768个不同的数据点。

二、逐行代码功能分析

2.1 头文件和宏定义部分
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
包含标准库和CUDA辅助库；helper_cuda.h提供checkCudaErrors()等错误检查宏。

2.2 
Kernel函数timedReduction

参数列表
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
__global__：表示这是CUDA内核函数，在设备端执行，从主机端调用。
input：输入数据指针（只读）。
output：输出结果指针（每个线程块输出一个最小值）。
timer：计时数组，存储每个块的开始和结束时钟。

共享内存声明
extern __shared__ float shared[];
动态分配共享内存，大小在kernel调用时通过第三个参数指定。
每个线程块独有的高速存储，位于SM内部。

线程索引获取
const int tid = threadIdx.x;
const int bid = blockIdx.x;
tid：块内线程索引（0-255）。
bid：网格中块索引（0-63）。

开始计时
if (tid == 0) timer[bid] = clock();
只有每个块的第一个线程（tid == 0）记录开始时间。
clock()返回GPU当前时钟周期计数。

数据加载到共享内存
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
每个线程加载两个数据到共享内存。
注意索引只用到tid，与bid无关：每个块加载的都是同一段input[0-511]（主机端也只分配了512个float），并不是块0处理input[0-511]、块1处理input[512-1023]。

并行归约核心循环
for (int d = blockDim.x; d > 0; d /= 2)
{
    __syncthreads();
    if (tid < d)
    {
        float f0 = shared[tid];
        float f1 = shared[tid + d];
        if (f1 < f0)
        {
            shared[tid] = f1;
        }
    }
}
归约流程：
d=256：线程0-255将shared[tid]与shared[tid+256]比较，保留较小值。
d=128：线程0-127继续比较shared[tid]与shared[tid+128]。
d=64：线程0-63继续合并。
…
直到d=1：线程0比较shared[0]与shared[1]，最终得出最小值。
__syncthreads()：同步块内所有线程，确保上一轮写入的数据对本轮可见。

输出结果
if (tid == 0) output[bid] = shared[0];
每个块的第一个线程将最小值写入全局内存。

结束计时
__syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock();
同步确保所有线程完成归约。
记录结束时钟，存储在数组后半部分（偏移gridDim.x = 64）。

2.3 主函数main

常量定义
#define NUM_BLOCKS 64
#define NUM_THREADS 256
定义网格和块大小。

设备选择
int dev = findCudaDevice(argc, (const char **)argv);
自动选择最佳CUDA设备。

内存分配
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
设备端内存分配：
dinput：512个float。
doutput：64个float（每个块一个结果）。
dtimer：128个clock_t（每个块开始和结束各一个）。

数据初始化与传输
for (int i = 0; i < NUM_THREADS * 2; i++)
{
    input[i] = (float)i;
}
主机端数据初始化为0-511。
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
将数据从主机拷贝到设备。

Kernel启动
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
启动配置的三个参数依次为：64, 256, 
512*sizeof(float)。
第三个参数指定动态共享内存大小（512*4 = 2048字节）。

结果回传与统计分析
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
将计时数据拷贝回主机。
long double avgElapsedClocks = 0;
for (int i = 0; i < NUM_BLOCKS; i++)
{
    avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
}
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
计算每个块的平均执行时钟周期数。
timer[i]：开始时间。
timer[i + NUM_BLOCKS]：结束时间。

三、关键技术点总结
动态共享内存：使用extern __shared__，在运行时指定大小。
并行归约：利用共享内存减少全局内存访问。
线程同步：__syncthreads()确保块内线程同步。
性能测量：clock()函数测量GPU时钟周期。
单线程写入：通过tid == 0保证每个块只有一个线程写入timer和output，避免块内写竞争（这里并没有使用原子操作）。

四、性能优化洞察
注释中的性能数据展示了块数量从1到64时，每个块的执行时钟数逐步增加。
原因：块数少时部分SM空闲，利用率低；块数增多后所有SM都被占用，但每个SM上的块数不足以隐藏内存延迟；块数继续增多时资源竞争加剧，耗时近似线性增长。
经验上，块数通常取SM数量的2-4倍左右，以便隐藏内存延迟（具体最优值需实测）。
该示例很好地展示了CUDA程序性能分析的基础方法。