CANN学习资源开源仓的算子调试(二):打印和日志

举报
黄生 发表于 2026/04/01 11:49:03 2026/04/01
【摘要】 DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。 参数 说明 tensor 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTenso...

DumpTensor 在算子执行过程中打印 Tensor 的内容,支持附加自定义信息(desc 参数,仅支持 uint32_t 类型),例如行号或标识符,以便在多处调用时区分不同输出。

参数 说明
tensor 支持 LocalTensor(位于 Unified Buffer/L1/L0C)或 GlobalTensor(位于 Global Memory)。
desc 用户自定义附加信息(如行号),用于区分不同调用位置的输出。
dumpSize 需要打印的元素个数。
shapeInfo 按照Tensor的shape信息进行打印。当 shapeInfo 指定的尺寸大于 dumpSize 时,不足部分用 "-" 填充显示;若小于等于 dumpSize,则按实际形状打印,多余元素不显示。

CPU调试用到的printf接口在NPU板上也可以使用,其语法与标准 C/C++ printf 兼容。

在Add算子核函数的Compute方法中,用DumpTensor输出LocalTensor的内容,验证每个AI Core在首次计算时的输入与输出是否符合预期。(xLocal、yLocal和zLocal的前8个元素)。在Init方法中,用printf输出关键的切分参数,验证多核并行场景下的数据划分是否符合预期。(数据总长度、每个核心的数据切块数以及参与计算的AI Core总数),这个我倒觉得没必要,因为tiling是从host传过来的,在host打印更方便,何必到device上呢?

//add_custom.asc

#include <cstdint>
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include "acl/acl.h"
#include "kernel_operator.h"

constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue (double buffering)

// Tiling parameters computed on the host and passed by value to the kernel.
struct AddCustomTilingData
{
    uint32_t totalLength; // total number of elements across all AI cores
    uint32_t tileNum;     // tiles per core; each tile is further split across BUFFER_NUM buffers
};

// Tiled element-wise Add operator: each AI core processes its own slice of
// global memory through a double-buffered CopyIn -> Compute -> CopyOut pipeline.
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    // Computes per-core tiling, binds this core's slice of global memory and
    // allocates the double-buffered queues.
    // x, y, z     : global-memory base addresses of the operands / result
    // totalLength : total element count across all cores
    // tileNum     : number of tiles per core (before double buffering)
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum(); // elements owned by this core
        this->tileNum = tileNum;
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM; // elements per tile slice
        // Print the key tiling parameters to verify the multi-core data split.
        // NOTE(review): "%lu" assumes GetBlockNum() yields an unsigned-long-sized
        // value — confirm against the AscendC printf documentation.
        AscendC::printf("totalLength: %d, tileNum: %d, blockNum: %lu\n", 
                        totalLength, tileNum, AscendC::GetBlockNum());
        // Each core only sees its own contiguous slice of global memory.
        xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        // One tileLength-sized float buffer per queue slot, BUFFER_NUM slots each.
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
    }
    // Runs the three-stage pipeline once per tile slice.
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM; // one iteration per tile slice
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    // Stage 1: copy the `progress`-th tile of x and y from GM into the input queues.
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
    }
    // Stage 2: z = x + y for one tile. On the first tile only, dump the first
    // 8 elements of inputs and output; __LINE__ is used as the desc tag so the
    // three dumps can be told apart in the output.
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
        // Only print on the first call: dump the first 8 input elements.
        if (progress == 0) {
            AscendC::DumpTensor(xLocal, __LINE__, 8);
            AscendC::DumpTensor(yLocal, __LINE__, 8);
        }
        
        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
        
        // Dump the first 8 output elements of the first tile.
        if (progress == 0) {
            AscendC::DumpTensor(zLocal, __LINE__, 8);
        }
        outQueueZ.EnQue<float>(zLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }
    // Stage 3: copy one computed tile from the output queue back to GM.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY; // input queues
    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;         // output queue
    AscendC::GlobalTensor<float> xGm;
    AscendC::GlobalTensor<float> yGm;
    AscendC::GlobalTensor<float> zGm;
    uint32_t blockLength; // elements per core
    uint32_t tileNum;     // tiles per core
    uint32_t tileLength;  // elements per tile slice
};

// Device entry point of the Add operator: every AI core builds its own
// KernelAdd instance from the tiling data and runs the tiled pipeline.
__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // vector cores only
    KernelAdd kernel;
    kernel.Init(x, y, z, tiling.totalLength, tiling.tileNum);
    kernel.Process();
}

std::vector<float> kernel_add(std::vector<float> &x, std::vector<float> &y)
{
    constexpr uint32_t blockDim = 8;
    uint32_t totalLength = x.size();
    size_t totalByteSize = totalLength * sizeof(float);
    int32_t deviceId = 0;
    aclrtStream stream = nullptr;
    AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8};
    uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
    uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
    uint8_t *zHost = nullptr;
    uint8_t *xDevice = nullptr;
    uint8_t *yDevice = nullptr;
    uint8_t *zDevice = nullptr;

    aclInit(nullptr);
    aclrtSetDevice(deviceId);
    aclrtCreateStream(&stream);

    aclrtMallocHost((void **)(&zHost), totalByteSize);
    aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
    aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);

    aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
    aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);

    add_custom<<<blockDim, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
    aclrtSynchronizeStream(stream);

    aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
    std::vector<float> z((float *)zHost, (float *)(zHost + totalByteSize));

    aclrtFree(xDevice);
    aclrtFree(yDevice);
    aclrtFree(zDevice);
    aclrtFreeHost(zHost);

    aclrtDestroyStream(stream);
    aclrtResetDevice(deviceId);
    aclFinalize();

    return z;
}

// Compares the device output against the host-computed golden reference.
// Prints a preview of both tensors, then returns 0 on an exact match and
// 1 on any mismatch (including a size mismatch).
uint32_t VerifyResult(std::vector<float> &output, std::vector<float> &golden)
{
    // Print up to the first 20 elements, with "..." when truncated.
    auto printTensor = [](std::vector<float> &tensor, const char *name) {
        constexpr size_t maxPrintSize = 20;
        std::cout << name << ": ";
        std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize),
            std::ostream_iterator<float>(std::cout, " "));
        if (tensor.size() > maxPrintSize) {
            std::cout << "...";
        }
        std::cout << std::endl;
    };
    printTensor(output, "Output");
    printTensor(golden, "Golden");
    // A size mismatch is a failure; checking it first also keeps std::equal
    // from reading past the end of a shorter output vector.
    if (output.size() == golden.size() &&
        std::equal(golden.begin(), golden.end(), output.begin())) {
        std::cout << "[Success] Case accuracy verification passed." << std::endl;
        return 0;
    }
    std::cout << "[Failed] Case accuracy verification failed!" << std::endl;
    return 1;
}

// Builds two constant input tensors, runs the device Add kernel and checks
// the result against a host-computed golden reference. Returns the
// verification status (0 = pass, 1 = fail).
int32_t main(int32_t argc, char *argv[])
{
    constexpr uint32_t totalLength = 8 * 2048;
    constexpr float valueX = 1.2f;
    constexpr float valueY = 2.3f;

    std::vector<float> inputX(totalLength, valueX);
    std::vector<float> inputY(totalLength, valueY);

    std::vector<float> result = kernel_add(inputX, inputY);

    std::vector<float> expected(totalLength, valueX + valueY);
    return VerifyResult(result, expected);
}

输出:

totalLength: 16384, tileNum: 8, blockNum: 8
DumpTensor: desc=63, addr=0x0, data_type=float32, position=UB, dump_size=8
[1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000, 1.200000]
DumpTensor: desc=64, addr=0x400, data_type=float32, position=UB, dump_size=8
[2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000, 2.300000]
DumpTensor: desc=70, addr=0x800, data_type=float32, position=UB, dump_size=8
[3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000, 3.500000]
...

Ascend EP 形态下日志默认存储在 $HOME/ascend/log/ 目录。RC 形态下默认存储在 /var/log/npu/slog/ 目录。可通过环境变量 ASCEND_PROCESS_LOG_PATH 自定义日志落盘路径(仅适用于 Ascend EP 模式)。关于精度问题的例子:

// Deliberately broken variant used in the article to illustrate an accuracy
// bug: inQueueX is sized in element count instead of bytes, so xLocal ends
// up smaller than tileLength and the Add reads past the valid data.
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->tileNum = tileNum;
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength); // the bug is here: missing "* sizeof(float)"
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
    }
    //...
    
private:
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
        // Debug prints added to locate the bug; observed output:
        //   xLocal size: 32
        //   tileLength: 128
        // (32 floats fit in tileLength bytes — a quarter of what is expected)
        AscendC::printf("xLocal size: %d\n", xLocal.GetSize());
        AscendC::printf("tileLength: %d\n", this->tileLength);
        
        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
        outQueueZ.EnQue<float>(zLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }
}        

关于对齐问题的例子。定位思路

  • 检查API使用是否满足通用对齐约束:确认DataCopy、VEC 相关指令的地址对齐要求,检查代码中内存访问的起始地址是否满足约束。
  • 打印地址偏移: 在 DataCopy等API前添加printf,打印 UB/GM 的访问起始地址偏移量,验证是否为对齐值的整数倍:
  • 孪生调试(CPU 侧)验证: 将算子工程改为核函数调用模式,通过孪生调试的CPU侧调试直接显示错误原因,精准定位错误行。
cd Sources/07.04/log/debug/plog && \
grep -nr "ERROR"

日志输出:

plog-2139_20260401113227121.log:1:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.103.305 [stars_engine.cc:1534]2216 ProcLogicCqReport:Task run failed, device_id=0, stream_id=47, task_id=0, sqe_type=0(ffts), errType=0x1(task exception), sqSwStatus=0
plog-2139_20260401113227121.log:2:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.791 [device_error_core_proc.cc:321]2216 AddExceptionRegInfo:add error register: core_id=10, stream_id=47, task_id=0
plog-2139_20260401113227121.log:3:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.817 [device_error_core_proc.cc:347]2216 PrintCoreInfo:The error from device(chipId:6, dieId:0), serial number is 9, there is an exception of aivec error, core id is 10, error code = 0, dump info: pc start: 0x12c041200000, current: 0x12c041200780, vec error info: 0xdf0571f120, mte error info: 0xfe, ifu error info: 0x10433f7000c0, ccu error info: 0x164c066855a631e3, cube error info: 0, biu error info: 0, aic error mask: 0x6500020bd00028c, para base: 0x12c100000000.
#重点是下面一条日志里面的 errorStr: instruction address misalign(ADDR_MISALIGN).
plog-2139_20260401113227121.log:4:[ERROR] RUNTIME(2139,demo):2026-04-01-11:32:27.131.870 [device_error_core_proc.cc:360]2216 PrintCoreInfo:The extend info: errcode:(0, 0x10000, 0) errorStr: instruction address misalign(ADDR_MISALIGN). fixp_error0 info: 0xfe, fixp_error1 info: 0, fsmId:0, tslot:6, thread:0, ctxid:0, blk:0, sublk:0, subErrType:4.

原因是下面这段代码的问题:

    // Deliberately broken example: reading from zLocal[1] offsets the UB
    // source address by one float (4 bytes), which breaks DataCopy's address
    // alignment constraint and triggers the ADDR_MISALIGN device error shown
    // in the log above.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal[1], this->tileLength); // zLocal[1] is the bug
        outQueueZ.FreeTensor(zLocal);
    }

最后实践题是找出sinh_custom.asc源码中存在两处代码问题并修正。

【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。