CANN opbase算子数据Dump接口
aclDumpOpTensors

【免费下载链接】opbase：本项目是CANN算子库的基础框架库，为算子提供公共依赖文件和基础调度能力。项目地址: https://gitcode.com/cann/opbase

功能说明：模型执行过程中，支持Dump算子输入/输出Tensor数据，方便算子输入/输出异常数据的问题定位和分析。

函数原型：
aclnnStatus aclDumpOpTensors(const char *opType, const char *opName, aclTensor **tensors, size_t inputTensorNum, size_t outputTensorNum, aclrtStream stream)

参数说明：
- opType：输入。字符串，表示算子类型，例如"Add"。
- opName：输入。字符串，表示算子名称，例如"add_custom"。
- tensors：输入。一维张量，表示待Dump的输入/输出Tensor对象指针。注意Tensor顺序：输入Tensor在前，输出Tensor在后。
- inputTensorNum：输入。表示待Dump的输入Tensor个数。
- outputTensorNum：输入。表示待Dump的输出Tensor个数。
- stream：输入。指定执行任务的Stream。

返回值说明：返回0表示成功，返回其他值表示失败。返回码列表参见公共接口返回码。

约束说明：本接口仅在开启算子Dump功能时有效。您可以通过aclInit接口开启Dump，也可以通过aclmdlInitDump、aclmdlSetDump、aclmdlFinalizeDump系列接口开启Dump，接口介绍请参见《Runtime运行时 API》。

调用示例（关键代码示例如下，仅供参考，不支持直接拷贝运行）：

1. 通过aclInit接口开启算子Dump功能，关键代码如下：

// 资源初始化
aclInit("./acl.json");
aclrtSetDevice(0);
aclrtStream stream = nullptr;
aclrtCreateStream(&stream);

acl.json示例如下（具体参见aclInit接口文档中模型Dump配置、单算子Dump配置示例）：

{
    "dump": {
        "dump_path": "./",
        "dump_list": [],
        "dump_mode": "all",
        "dump_data": "tensor"
    }
}

2. 调用本接口，关键伪代码（以torch算子为例）如下：

#include <torch/extension.h>
#include <torch_npu/csrc/core/npu/NPUStream.h>
#include <torch_npu/csrc/core/npu/NPUFunctions.h>
#include <torch_npu/csrc/framework/OpCommand.h>
#include <torch_npu/csrc/framework/interface/AclOpCompileInterface.h>
#include <torch_npu/csrc/core/npu/register/OptionsManager.h>
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/flopcount/FlopCount.h>
#include <torch_npu/csrc/flopcount/FlopCounter.h>
#include <torch_npu/csrc/core/npu/NpuVariables.h>
#include "kernel_operator.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>

constexpr int32_t BUFFER_NUM = 2;
constexpr int64_t MAX_DIM_NUM = 5;
constexpr int64_t NCL_DIM_NUM = 3;
constexpr int64_t NCHW_DIM_NUM = 4;
constexpr int64_t NCDHW_DIM_NUM = 5;

// 生成待Dump算子的输入/输出Tensor对象指针（一维张量）。
#define INIT_ACL_TENSOR_ARRAY(tensors, ...) \
aclTensor* tensors[] {__VA_ARGS__} // at::Tensor对象转换成aclTensor对象。本函数简化了处理过程具体以实际算子为准。 aclTensor *ConvertTensor(const at::Tensor at_tensor) { aclDataType acl_data_type ACL_FLOAT16; c10::SmallVectorint64_t, MAX_DIM_NUM storageDims; const auto dimNum at_tensor.sizes().size(); aclFormat format ACL_FORMAT_ND; switch (dimNum) { case NCL_DIM_NUM: format ACL_FORMAT_NCL; break; case NCHW_DIM_NUM: format ACL_FORMAT_NCHW; break; case NCDHW_DIM_NUM: format ACL_FORMAT_NCDHW; break; default: format ACL_FORMAT_ND; } // if acl_data_type is ACL_STRING, storageDims is empty. if (acl_data_type ! ACL_STRING) { storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize()); } auto acl_tensor aclCreateTensor(at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type, at_tensor.strides().data(), at_tensor.storage_offset(), format, storageDims.data(), storageDims.size(), const_castvoid *(at_tensor.storage().data())); return acl_tensor; } // 自定义算子实现。具体以实际算子为准。 class KernelAdd { public: __aicore__ inline KernelAdd() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { this-blockLength totalLength / AscendC::GetBlockNum(); this-tileNum 8; this-tileLength this-blockLength / this-tileNum / BUFFER_NUM; xGm.SetGlobalBuffer((__gm__ half *)x this-blockLength * AscendC::GetBlockIdx(), this-blockLength); yGm.SetGlobalBuffer((__gm__ half *)y this-blockLength * AscendC::GetBlockIdx(), this-blockLength); zGm.SetGlobalBuffer((__gm__ half *)z this-blockLength * AscendC::GetBlockIdx(), this-blockLength); pipe.InitBuffer(inQueueX, BUFFER_NUM, this-tileLength * sizeof(half)); pipe.InitBuffer(inQueueY, BUFFER_NUM, this-tileLength * sizeof(half)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this-tileLength * sizeof(half)); } __aicore__ inline void Process() { int32_t loopCount this-tileNum * BUFFER_NUM; for (int32_t i 0; i loopCount; i) { CopyIn(i); Compute(i); CopyOut(i); } } private: __aicore__ inline void CopyIn(int32_t progress) { AscendC::LocalTensorhalf 
xLocal inQueueX.AllocTensorhalf(); AscendC::LocalTensorhalf yLocal inQueueY.AllocTensorhalf(); AscendC::DataCopy(xLocal, xGm[progress * this-tileLength], this-tileLength); AscendC::DataCopy(yLocal, yGm[progress * this-tileLength], this-tileLength); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progress) { AscendC::LocalTensorhalf xLocal inQueueX.DeQuehalf(); AscendC::LocalTensorhalf yLocal inQueueY.DeQuehalf(); AscendC::LocalTensorhalf zLocal outQueueZ.AllocTensorhalf(); AscendC::Add(zLocal, xLocal, yLocal, this-tileLength); outQueueZ.EnQuehalf(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } __aicore__ inline void CopyOut(int32_t progress) { AscendC::LocalTensorhalf zLocal outQueueZ.DeQuehalf(); AscendC::DataCopy(zGm[progress * this-tileLength], zLocal, this-tileLength); outQueueZ.FreeTensor(zLocal); } private: AscendC::TPipe pipe; AscendC::TQueAscendC::TPosition::VECIN, BUFFER_NUM inQueueX, inQueueY; AscendC::TQueAscendC::TPosition::VECOUT, BUFFER_NUM outQueueZ; AscendC::GlobalTensorhalf xGm; AscendC::GlobalTensorhalf yGm; AscendC::GlobalTensorhalf zGm; uint32_t blockLength; uint32_t tileNum; uint32_t tileLength; }; __global__ __vector__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { KernelAdd op; op.Init(x, y, z, totalLength); op.Process(); } namespace ascendc_ops { at::Tensor ascendc_add(const at::Tensor x, const at::Tensor y) { auto aclStream c10_npu::getCurrentNPUStream().stream(false); at::Tensor z at::empty_like(x); uint32_t numBlocks 8; uint32_t totalLength 1; for (uint32_t size : x.sizes()) { totalLength * size; } add_customnumBlocks, nullptr, aclStream((uint8_t*)(x.mutable_data_ptr()), (uint8_t*)(y.mutable_data_ptr()), (uint8_t*)(z.mutable_data_ptr()), totalLength); // Dump算子输入/输出Tensor数据。 INIT_ACL_TENSOR_ARRAY(tensors, ConvertTensor(x), ConvertTensor(y), ConvertTensor(z)); aclDumpOpTensors(Add, add_custom, tensors, 2, 1, aclStream); // 释放aclTensor对象。 for (size_t i 
0; i 3; i) { aclDestroyTensor(tensors[i]); } return z; } } // namespace ascendc_ops TORCH_LIBRARY(ascendc_ops, m) { m.def(ascendc_add(Tensor x, Tensor y) - Tensor); } TORCH_LIBRARY_IMPL(ascendc_ops, PrivateUse1, m) { m.impl(ascendc_add, TORCH_FN(ascendc_ops::ascendc_add)); }【免费下载链接】opbase本项目是CANN算子库的基础框架库为算子提供公共依赖文件和基础调度能力。项目地址: https://gitcode.com/cann/opbase创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考