CANN学习资源开源仓的算子调试(二):打印和日志

举报
黄生 发表于 2026/04/01 11:49:03 2026/04/01
【摘要】 DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。 参数 说明 tensor 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTenso...

DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。

参数 说明
tensor 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTensor(位于 Global Memory)。
desc 用户自定义附加信息(如行号),用于区分不同调用位置的输出。
dumpSize 需要打印的元素个数。
shapeInfo 按照Tensor的shape信息进行打印。当 shapeInfo 指定的尺寸大于 dumpSize 时,不足部分用 "-" 填充显示;若小于等于 dumpSize,则按实际形状打印,多余元素不显示。

CPU调试用到的printf接口在NPU板上也可以使用,其语法与标准 C/C++ printf 兼容。

在Add算子核函数的Compute方法中,用DumpTensor输出LocalTensor的内容,验证每个AI Core在首次计算时的输入与输出是否符合预期。(xLocal、yLocal和zLocal的前8个元素)。在Init方法中,用printf输出关键的切分参数,验证多核并行场景下的数据划分是否符合预期。(数据总长度、每个核心的数据切块数以及参与计算的AI Core总数),这个我倒觉得没必要,因为tiling是从host传过来的,在host打印更方便,何必到device上呢?

//add_custom.asc

#include <cstdint>
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include "acl/acl.h"
#include "kernel_operator.h"

constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue (double buffering)

// Tiling parameters computed on the host and passed by value to the kernel.
struct AddCustomTilingData
{
    uint32_t totalLength; // total number of elements across all AI cores
    uint32_t tileNum;     // tiles per core; each tile is further split across BUFFER_NUM buffers
};

// Tiled element-wise Add operator: each AI core processes its own slice of
// global memory through a double-buffered CopyIn -> Compute -> CopyOut pipeline.
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    // Computes per-core tiling, binds this core's slice of global memory and
    // allocates the double-buffered queues.
    // x, y, z     : global-memory base addresses of the operands / result
    // totalLength : total element count across all cores
    // tileNum     : number of tiles per core (before double buffering)
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum(); // elements owned by this core
        this->tileNum = tileNum;
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM; // elements per tile slice
        // Print the key tiling parameters to verify the multi-core data split.
        // NOTE(review): "%lu" assumes GetBlockNum() yields an unsigned-long-sized
        // value — confirm against the AscendC printf documentation.
        AscendC::printf("totalLength: %d, tileNum: %d, blockNum: %lu\n", 
                        totalLength, tileNum, AscendC::GetBlockNum());
        // Each core only sees its own contiguous slice of global memory.
        xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        // One tileLength-sized float buffer per queue slot, BUFFER_NUM slots each.
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
    }
    // Runs the three-stage pipeline once per tile slice.
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM; // one iteration per tile slice
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    // Stage 1: copy the `progress`-th tile of x and y from GM into the input queues.
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
    }
    // Stage 2: z = x + y for one tile. On the first tile only, dump the first
    // 8 elements of inputs and output; __LINE__ is used as the desc tag so the
    // three dumps can be told apart in the output.
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
        // Only print on the first call: dump the first 8 input elements.
        if (progress == 0) {
            AscendC::DumpTensor(xLocal, __LINE__, 8);
            AscendC::DumpTensor(yLocal, __LINE__, 8);
        }
        
        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
        
        // Dump the first 8 output elements of the first tile.
        if (progress == 0) {
            AscendC::DumpTensor(zLocal, __LINE__, 8);
        }
        outQueueZ.EnQue<float>(zLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }
    // Stage 3: copy one computed tile from the output queue back to GM.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY; // input queues
    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;         // output queue
    AscendC::GlobalTensor<float> xGm;
    AscendC::GlobalTensor<float> yGm;
    AscendC::GlobalTensor<float> zGm;
    uint32_t blockLength; // elements per core
    uint32_t tileNum;     // tiles per core
    uint32_t tileLength;  // elements per tile slice
};

// Device entry point of the Add operator: every AI core builds its own
// KernelAdd instance from the tiling data and runs the tiled pipeline.
__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // vector cores only
    KernelAdd kernel;
    kernel.Init(x, y, z, tiling.totalLength, tiling.tileNum);
    kernel.Process();
}

std::vector<float> kernel_add(std::vector<float> &x, std::vector<float> &y)
{
    constexpr uint32_t blockDim = 8;
    uint32_t totalLength = x.size();
    size_t totalByteSize = totalLength * sizeof(float);
    int32_t deviceId = 0;
    aclrtStream stream = nullptr;
    AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8};
    uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
    uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
    uint8_t *zHost = nullptr;
    uint8_t *xDevice = nullptr;
    uint8_t *yDevice = nullptr;
    uint8_t *zDevice = nullptr;

    aclInit(nullptr);
    aclrtSetDevice(deviceId);
    aclrtCreateStream(&stream);

    aclrtMallocHost((void **)(&zHost), totalByteSize);
    aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);

    aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);

    add_custom<<<blockDim, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
    aclrtSynchronizeStream(stream);

    aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
    std::vector<float> z((float *)zHost, (float *)(zHost + totalByteSize));

    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(zHost);

    aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();

    return z;
}

// Compares the device output against the host-computed golden reference.
// Prints a preview of both tensors, then returns 0 on an exact match and
// 1 on any mismatch (including a size mismatch).
uint32_t VerifyResult(std::vector<float> &output, std::vector<float> &golden)
{
    // Print up to the first 20 elements, with "..." when truncated.
    auto printTensor = [](std::vector<float> &tensor, const char *name) {
        constexpr size_t maxPrintSize = 20;
        std::cout << name << ": ";
        std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize),
            std::ostream_iterator<float>(std::cout, " "));
        if (tensor.size() > maxPrintSize) {
            std::cout << "...";
        }
        std::cout << std::endl;
    };
    printTensor(output, "Output");
    printTensor(golden, "Golden");
    // A size mismatch is a failure; checking it first also keeps std::equal
    // from reading past the end of a shorter output vector.
    if (output.size() == golden.size() &&
        std::equal(golden.begin(), golden.end(), output.begin())) {
        std::cout << "[Success] Case accuracy verification passed." << std::endl;
        return 0;
    }
    std::cout << "[Failed] Case accuracy verification failed!" << std::endl;
    return 1;
}

// Builds two constant input tensors, runs the device Add kernel and checks
// the result against a host-computed golden reference. Returns the
// verification status (0 = pass, 1 = fail).
int32_t main(int32_t argc, char *argv[])
{
    constexpr uint32_t totalLength = 8 * 2048;
    constexpr float valueX = 1.2f;
    constexpr float valueY = 2.3f;

    std::vector<float> inputX(totalLength, valueX);
    std::vector<float> inputY(totalLength, valueY);

    std::vector<float> result = kernel_add(inputX, inputY);

    std::vector<float> expected(totalLength, valueX + valueY);
    return VerifyResult(result, expected);
}

输出:

totalLength: 16384, tileNum: 8, blockNum: 8
DumpTensor: desc=63, addr=0x0, data_type=float32, position=UB, dump_size=8
[1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000]
DumpTensor: desc=64, addr=0x400, data_type=float32, position=UB, dump_size=8
[2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000]
DumpTensor: desc=70, addr=0x800, data_type=float32, position=UB, dump_size=8
[3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000]
...

Ascend EP 形态下日志默认存储在 $HOME/ascend/log/ 目录。RC 形态下默认存储在 /var/log/npu/slog/ 目录。可通过环境变量 ASCEND_PROCESS_LOG_PATH 自定义日志落盘路径(仅适用于 Ascend EP 模式)。关于精度问题的例子:

// Deliberately broken variant used in the article to illustrate an accuracy
// bug: inQueueX is sized in element count instead of bytes, so xLocal ends
// up smaller than tileLength and the Add reads past the valid data.
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->tileNum = tileNum;
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength); // the bug is here: missing "* sizeof(float)"
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
    }
    //...
    
private:
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
        // Debug prints added to locate the bug; observed output:
        //   xLocal size: 32
        //   tileLength: 128
        // (32 floats fit in tileLength bytes — a quarter of what is expected)
        AscendC::printf("xLocal size: %d\n", xLocal.GetSize());
        AscendC::printf("tileLength: %d\n", this->tileLength);
        
        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
        outQueueZ.EnQue<float>(zLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }
}        

关于对齐问题的例子。定位思路

  • 检查API使用是否满足通用对齐约束:确认DataCopy、VEC 相关指令的地址对齐要求,检查代码中内存访问的起始地址是否满足约束。
  • 打印地址偏移: 在 DataCopy等API前添加printf,打印 UB/GM 的访问起始地址偏移量,验证是否为对齐值的整数倍:
  • 孪生调试(CPU 侧)验证: 将算子工程改为核函数调用模式,通过孪生调试的CPU侧调试直接显示错误原因,精准定位错误行。
cd Sources/07.04/log/debug/plog && \
grep -nr "ERROR"

日志输出:

plog-2139_20260401113227121.log:1:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.103.305 [stars_engine.cc:1534]2216 ProcLogicCqReport:Task run failed, device_id=0, stream_id=47, task_id=0, sqe_type=0(ffts), errType=0x1(task exception), sqSwStatus=0
plog-2139_20260401113227121.log:2:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.791 [device_error_core_proc.cc:321]2216 AddExceptionRegInfo:add error register: core_id=10, stream_id=47, task_id=0
plog-2139_20260401113227121.log:3:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.817 [device_error_core_proc.cc:347]2216 PrintCoreInfo:The error from device(chipId:6, dieId:0), serial number is 9, there is an exception of aivec error, core id is 10, error code = 0, dump info: pc start: 0x12c041200000, current: 0x12c041200780, vec error info: 0xdf0571f120, mte error info: 0xfe, ifu error info: 0x10433f7000c0, ccu error info: 0x164c066855a631e3, cube error info: 0, biu error info: 0, aic error mask: 0x6500020bd00028c, para base: 0x12c100000000.
#重点是下面一条日志里面的 errorStr: instruction address misalign(ADDR_MISALIGN).
plog-2139_20260401113227121.log:4:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.870 [device_error_core_proc.cc:360]2216 PrintCoreInfo:The extend info: errcode:(0, 0x10000, 0) errorStr: instruction address misalign(ADDR_MISALIGN). fixp_error0 info: 0xfe, fixp_error1 info: 0, fsmId:0, tslot:6, thread:0, ctxid:0, blk:0, sublk:0, subErrType:4.

原因是下面这段代码的问题:

    // Deliberately broken example: reading from zLocal[1] offsets the UB
    // source address by one float (4 bytes), which breaks DataCopy's address
    // alignment constraint and triggers the ADDR_MISALIGN device error shown
    // in the log above.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal[1], this->tileLength); // zLocal[1] is the bug
        outQueueZ.FreeTensor(zLocal);
    }

最后实践题是找出sinh_custom.asc源码中存在两处代码问题并修正。

【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。