CANN学习资源开源仓的算子调试(二):打印和日志
【摘要】 DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。 参数 说明 tensor 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTenso...
DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。
| 参数 | 说明 |
|---|---|
| tensor | 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTensor(位于 Global Memory)。 |
| desc | 用户自定义附加信息(如行号),用于区分不同调用位置的输出。 |
| dumpSize | 需要打印的元素个数。 |
| shapeInfo | 按照Tensor的shape信息进行打印。当 shapeInfo 指定的尺寸大于 dumpSize 时,不足部分用 "-" 填充显示;若小于等于 dumpSize,则按实际形状打印,多余元素不显示。 |
CPU调试用到的printf接口在NPU板上也可以使用,其语法与标准 C/C++ printf 兼容。
在Add算子核函数的Compute方法中,用DumpTensor输出LocalTensor的内容,验证每个AI Core在首次计算时的输入与输出是否符合预期。(xLocal、yLocal和zLocal的前8个元素)。在Init方法中,用printf输出关键的切分参数,验证多核并行场景下的数据划分是否符合预期。(数据总长度、每个核心的数据切块数以及参与计算的AI Core总数),这个我倒觉得没必要,因为tiling是从host传过来的,在host打印更方便,何必到device上呢?
//add_custom.asc
#include <cstdint>
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include "acl/acl.h"
#include "kernel_operator.h"
constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue (double buffering: ping-pong)
// Tiling parameters computed on the host and passed by value to the kernel.
struct AddCustomTilingData
{
// Total number of float elements across all AI cores.
uint32_t totalLength;
// Number of tiles each core's data block is split into.
uint32_t tileNum;
};
// Element-wise float Add operator built on the standard AscendC three-stage
// pipeline (CopyIn -> Compute -> CopyOut) with double buffering. Each AI core
// works on a contiguous slice of the inputs selected by its block index.
class KernelAdd {
public:
__aicore__ inline KernelAdd() {}
// Bind this core's slice of global memory and size the UB queues.
// x/y/z: global-memory base addresses of the two inputs and the output.
// totalLength: total element count across all cores. tileNum: tiles per core.
// Assumes totalLength divides evenly by GetBlockNum(), tileNum and
// BUFFER_NUM -- TODO confirm against the host-side tiling.
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
{
this->blockLength = totalLength / AscendC::GetBlockNum();
this->tileNum = tileNum;
// Halved once more because BUFFER_NUM buffers alternate per tile.
this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
// Print the key tiling parameters to verify the split logic on device.
// NOTE(review): totalLength/tileNum are uint32_t printed with %d and
// GetBlockNum() with %lu -- verify the specifiers against the
// AscendC::printf documentation.
AscendC::printf("totalLength: %d, tileNum: %d, blockNum: %lu\n",
totalLength, tileNum, AscendC::GetBlockNum());
// Offset each GM view by this core's block index so cores never overlap.
xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
// Queue buffers are sized in BYTES: elements * sizeof(float).
pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float));
pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
}
// Drive the pipeline over every tile (tileNum tiles x BUFFER_NUM buffers).
__aicore__ inline void Process()
{
int32_t loopCount = this->tileNum * BUFFER_NUM;
for (int32_t i = 0; i < loopCount; i++) {
CopyIn(i);
Compute(i);
CopyOut(i);
}
}
private:
// Stage 1: copy one tile of x and y from GM into UB and enqueue.
__aicore__ inline void CopyIn(int32_t progress)
{
AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
inQueueX.EnQue(xLocal);
inQueueY.EnQue(yLocal);
}
// Stage 2: z = x + y on one tile; dump tensors on the first tile only.
__aicore__ inline void Compute(int32_t progress)
{
AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
// Print the first 8 elements of the inputs only on the first call;
// __LINE__ as the desc parameter distinguishes the three dump sites.
if (progress == 0) {
AscendC::DumpTensor(xLocal, __LINE__, 8);
AscendC::DumpTensor(yLocal, __LINE__, 8);
}
AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
if (progress == 0) {
AscendC::DumpTensor(zLocal, __LINE__, 8);
}
outQueueZ.EnQue<float>(zLocal);
inQueueX.FreeTensor(xLocal);
inQueueY.FreeTensor(yLocal);
}
// Stage 3: copy the computed tile from UB back to GM and release it.
__aicore__ inline void CopyOut(int32_t progress)
{
AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
outQueueZ.FreeTensor(zLocal);
}
private:
AscendC::TPipe pipe;
AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
AscendC::GlobalTensor<float> xGm;
AscendC::GlobalTensor<float> yGm;
AscendC::GlobalTensor<float> zGm;
uint32_t blockLength; // elements per AI core
uint32_t tileNum; // tiles per core (before double buffering)
uint32_t tileLength; // elements per pipeline iteration
};
// Kernel entry point for the custom Add operator.
// Restricts execution to the AI vector cores; `tiling` carries the
// host-computed split parameters by value.
__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
    KernelAdd addKernel;
    addKernel.Init(x, y, z, tiling.totalLength, tiling.tileNum);
    addKernel.Process();
}
// Host-side driver: initializes ACL, moves x/y to the device, launches the
// add_custom kernel on `blockDim` AI cores, and returns the result vector.
// NOTE(review): return codes of the acl* calls are not checked -- acceptable
// for a sample, but production code should verify each one.
std::vector<float> kernel_add(std::vector<float> &x, std::vector<float> &y)
{
// Must match the per-core slicing done in KernelAdd::Init.
constexpr uint32_t blockDim = 8;
uint32_t totalLength = x.size();
size_t totalByteSize = totalLength * sizeof(float);
int32_t deviceId = 0;
aclrtStream stream = nullptr;
// Tiling is computed on the host and passed to the kernel by value.
AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8};
uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
uint8_t *zHost = nullptr;
uint8_t *xDevice = nullptr;
uint8_t *yDevice = nullptr;
uint8_t *zDevice = nullptr;
// ACL lifecycle: init -> set device -> create stream (order matters).
aclInit(nullptr);
aclrtSetDevice(deviceId);
aclrtCreateStream(&stream);
// Pinned host buffer for the output; device buffers for all three tensors.
aclrtMallocHost((void **)(&zHost), totalByteSize);
aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
// Asynchronous launch; the following synchronize waits for completion.
add_custom<<<blockDim, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
aclrtSynchronizeStream(stream);
aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
// zHost + totalByteSize advances in bytes, then both ends are cast to float*.
std::vector<float> z((float *)zHost, (float *)(zHost + totalByteSize));
// Teardown mirrors setup in reverse order.
aclrtFree(xDevice);
aclrtFree(yDevice);
aclrtFree(zDevice);
aclrtFreeHost(zHost);
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return z;
}
// Compare the device output against the host-computed golden reference.
// Prints a preview of both tensors, then returns 0 on exact match, 1 otherwise.
uint32_t VerifyResult(std::vector<float> &output, std::vector<float> &golden)
{
    // Print the first few elements of a tensor for quick visual inspection.
    auto printTensor = [](std::vector<float> &tensor, const char *name) {
        constexpr size_t maxPrintSize = 20;
        std::cout << name << ": ";
        std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize),
                  std::ostream_iterator<float>(std::cout, " "));
        if (tensor.size() > maxPrintSize) {
            std::cout << "...";
        }
        std::cout << std::endl;
    };
    printTensor(output, "Output");
    printTensor(golden, "Golden");
    // Guard against a size mismatch: std::equal over golden's full range would
    // read past output.end() (undefined behavior) if output were shorter.
    if (output.size() == golden.size() &&
        std::equal(golden.begin(), golden.end(), output.begin())) {
        std::cout << "[Success] Case accuracy is verification passed." << std::endl;
        return 0;
    }
    std::cout << "[Failed] Case accuracy is verification failed!" << std::endl;
    return 1;
}
// Sample entry point: fill two constant vectors, run the device Add,
// and verify the result against a host-computed golden reference.
int32_t main(int32_t argc, char *argv[])
{
    // 8 cores x 2048 elements, matching blockDim used by kernel_add.
    constexpr uint32_t totalLength = 8 * 2048;
    constexpr float valueX = 1.2f;
    constexpr float valueY = 2.3f;
    std::vector<float> x(totalLength, valueX);
    std::vector<float> y(totalLength, valueY);
    // Golden reference: every element is simply valueX + valueY.
    std::vector<float> golden(totalLength, valueX + valueY);
    std::vector<float> output = kernel_add(x, y);
    return VerifyResult(output, golden);
}
输出:
totalLength: 16384, tileNum: 8, blockNum: 8
DumpTensor: desc=63, addr=0x0, data_type=float32, position=UB, dump_size=8
[1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000]
DumpTensor: desc=64, addr=0x400, data_type=float32, position=UB, dump_size=8
[2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000]
DumpTensor: desc=70, addr=0x800, data_type=float32, position=UB, dump_size=8
[3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000]
...
Ascend EP 形态下日志默认存储在 $HOME/ascend/log/ 目录。RC 形态下默认存储在 /var/log/npu/slog/ 目录。可通过环境变量 ASCEND_PROCESS_LOG_PATH 自定义日志落盘路径(仅适用于 Ascend EP 模式)。关于精度问题的例子:
// Teaching example (intentionally buggy, excerpted -- not a complete class):
// demonstrates diagnosing an accuracy problem with AscendC::printf.
class KernelAdd {
public:
__aicore__ inline KernelAdd() {}
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
{
this->blockLength = totalLength / AscendC::GetBlockNum();
this->tileNum = tileNum;
this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength); // BUG (deliberate, for the demo): size is in bytes, so `* sizeof(float)` is missing -- the X buffer holds only tileLength/4 floats
pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
}
//...
private:
__aicore__ inline void Compute(int32_t progress)
{
AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
// Debug prints reveal the mismatch -- observed output:
//xLocal size: 32
//tileLength: 128
AscendC::printf("xLocal size: %d\n", xLocal.GetSize());
AscendC::printf("tileLength: %d\n", this->tileLength);
AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
outQueueZ.EnQue<float>(zLocal);
inQueueX.FreeTensor(xLocal);
inQueueY.FreeTensor(yLocal);
}
}
关于对齐问题的例子。定位思路
- 检查API使用是否满足通用对齐约束:确认DataCopy、VEC 相关指令的地址对齐要求,检查代码中内存访问的起始地址是否满足约束。
- 打印地址偏移: 在 DataCopy等API前添加printf,打印 UB/GM 的访问起始地址偏移量,验证是否为对齐值的整数倍:
- 孪生调试(CPU 侧)验证: 将算子工程改为核函数调用模式,通过孪生调试的CPU侧调试直接显示错误原因,精准定位错误行。
cd Sources/07.04/log/debug/plog && \
grep -nr "ERROR"
日志输出:
plog-2139_20260401113227121.log:1:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.103.305 [stars_engine.cc:1534]2216 ProcLogicCqReport:Task run failed, device_id=0, stream_id=47, task_id=0, sqe_type=0(ffts), errType=0x1(task exception), sqSwStatus=0
plog-2139_20260401113227121.log:2:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.791 [device_error_core_proc.cc:321]2216 AddExceptionRegInfo:add error register: core_id=10, stream_id=47, task_id=0
plog-2139_20260401113227121.log:3:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.817 [device_error_core_proc.cc:347]2216 PrintCoreInfo:The error from device(chipId:6, dieId:0), serial number is 9, there is an exception of aivec error, core id is 10, error code = 0, dump info: pc start: 0x12c041200000, current: 0x12c041200780, vec error info: 0xdf0571f120, mte error info: 0xfe, ifu error info: 0x10433f7000c0, ccu error info: 0x164c066855a631e3, cube error info: 0, biu error info: 0, aic error mask: 0x6500020bd00028c, para base: 0x12c100000000.
#重点是下面一条日志里面的 errorStr: instruction address misalign(ADDR_MISALIGN).
plog-2139_20260401113227121.log:4:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.870 [device_error_core_proc.cc:360]2216 PrintCoreInfo:The extend info: errcode:(0, 0x10000, 0) errorStr: instruction address misalign(ADDR_MISALIGN). fixp_error0 info: 0xfe, fixp_error1 info: 0, fsmId:0, tslot:6, thread:0, ctxid:0, blk:0, sublk:0, subErrType:4.
原因是下面这段代码的问题:
// Teaching example (intentionally buggy): triggers the ADDR_MISALIGN
// exception shown in the plog above.
__aicore__ inline void CopyOut(int32_t progress)
{
AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
AscendC::DataCopy(zGm[progress * this->tileLength], zLocal[1], this->tileLength); // BUG (deliberate, for the demo): zLocal[1] shifts the UB source address by one float (4 bytes), violating DataCopy's alignment requirement
outQueueZ.FreeTensor(zLocal);
}
最后的实践题是:找出 sinh_custom.asc 源码中存在的两处代码问题并修正。
【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)