- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

Ascend C算子开发（中级）—— 编写Sinh算子

Byyyi耀发表于 2024/09/23 16:55:34 2024/09/23

【摘要】 Ascend C算子开发（中级）—— 编写Sinh算子准备工作一块香橙派AI pro开发板，一根Type-c口的电源线，一根网线，一个网线转接器，一台笔记本电脑。香橙派与PC连接1). 硬件连接与启动（如下图所示）a）检查Orange Pi AI pro 是否已经插入SD卡；b）使用网线连接Orange Pi AI pro以太网口，网线另一端连接PC/转接头；c）连接电源，如下图所示两...

Ascend C算子开发（中级）—— 编写Sinh算子

准备工作

一块香橙派AI pro开发板，一根Type-c口的电源线，一根网线，一个网线转接器，一台笔记本电脑。

香橙派与PC连接

1). 硬件连接与启动（如下图所示）

a）检查Orange Pi AI pro 是否已经插入SD卡；
b）使用网线连接Orange Pi AI pro以太网口，网线另一端连接PC/转接头；
c）连接电源，如下图所示两个LED指示灯绿色常亮，表示启动正常；
d）网口下方两个灯，右侧绿灯常亮，左侧橙灯闪烁，代表网口物理连接正常

2).从PC远程登录到香橙派（根据各自系统版本选择）

（Windows）以太网口远程登录：
开发文档（备注，OPi AIpro以太网口为192.168.137.100，Windows PC以太网可设置为192.168.137.101 ； ssh链接以 root 用户名登录，密码为 Mind@123）
Mac系统远程登录：
开发文档（备注，OPi AIpro以太网口为192.168.137.100，Mac PC以太网口可设置为192.168.137.2 ；ssh链接以 root 用户名登录，密码为 Mind@123）

Add 算子调用体验

编译并调用一个Add算子的全过程：

编译

cd ~/samples/operator/AddCustomSample/FrameworkLaunch/AddCustom
bash build.sh

部署

cd build_out
./custom_opp_ubuntu_aarch64.run

调用

cd ~/samples/operator/AddCustomSample/FrameworkLaunch/AclNNInvocation
bash run.sh

Sinh算子开发（Ascend C算子开发中级认证考试内容）

1). 使用提供的考试代码工程，cd /root/SinhCustom/SinhCustom ，依次打开下图红框所示的三个源码文件，并根据注释提示补全代码；可以对照Add算子的例子进行修改。

op_host端

sinh_custom.cpp

#include "sinh_custom_tiling.h"
#include "register/op_def_registry.h"

namespace optiling {
/**
 * Tiling function for the SinhCustom operator.
 *
 * Sets the block dimension to BLOCK_DIM, stores the element count of input 0
 * (read from its origin shape) and TILE_NUM in a SinhCustomTilingData object,
 * serializes that object into the context's raw tiling buffer, and sets the
 * single workspace size entry to 0.
 *
 * @param context Tiling context handed in by the framework.
 * @return ge::GRAPH_SUCCESS on success; ge::GRAPH_FAILED when the context or
 *         any pointer obtained from it is null.
 */
static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    constexpr uint32_t BLOCK_DIM = 8;  // value passed to SetBlockDim
    constexpr uint32_t TILE_NUM = 8;   // tile count forwarded to the kernel via tiling data

    if (context == nullptr) {
        return ge::GRAPH_FAILED;
    }

    // Validate every pointer before it is dereferenced so that a malformed
    // context produces a failure status instead of a crash.
    const auto* inputShape = context->GetInputShape(0);
    auto* rawTiling = context->GetRawTilingData();
    if (inputShape == nullptr || rawTiling == nullptr) {
        return ge::GRAPH_FAILED;
    }

    const uint32_t totalLength = inputShape->GetOriginShape().GetShapeSize();
    context->SetBlockDim(BLOCK_DIM);

    SinhCustomTilingData tiling;
    tiling.set_totalLength(totalLength);
    tiling.set_tileNum(TILE_NUM);
    tiling.SaveToBuffer(rawTiling->GetData(), rawTiling->GetCapacity());
    rawTiling->SetDataSize(tiling.GetDataSize());

    // One workspace entry is requested and set to zero bytes.
    size_t* workspaceSizes = context->GetWorkspaceSizes(1);
    if (workspaceSizes == nullptr) {
        return ge::GRAPH_FAILED;
    }
    workspaceSizes[0] = 0;
    return ge::GRAPH_SUCCESS;
}
}
/**
 * Shape inference for SinhCustom: output 0 receives a copy of input 0's shape.
 *
 * Returns GRAPH_FAILED when the context or either shape pointer is null, and
 * GRAPH_SUCCESS otherwise.
 */
namespace ge {
static ge::graphStatus InferShape(gert::InferShapeContext* context)
{
    if (context == nullptr) {
        return GRAPH_FAILED;
    }

    const gert::Shape* inputShape = context->GetInputShape(0);
    gert::Shape* outputShape = context->GetOutputShape(0);
    // Both pointers are dereferenced below; reject a context that lacks them.
    if (inputShape == nullptr || outputShape == nullptr) {
        return GRAPH_FAILED;
    }

    *outputShape = *inputShape;
    return GRAPH_SUCCESS;
}
}
/**
该类定义了一个自定义的sinh算子，明确了输入和输出的张量格式和数据类型（DT_FLOAT16），并且指定该算子的推理形状函数是InferShape，Tiling函数是TilingFunc。
最后，通过OP_ADD(SinhCustom)将该算子注册到Ascend编译器中。
**/
namespace ops {
class SinhCustom : public OpDef {
public:
    explicit SinhCustom(const char* name) : OpDef(name)
    {
        this->Input("x")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16})
            .Format({ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND});
        this->Output("y")
            .ParamType(REQUIRED)
            .DataType({ge::DT_FLOAT16})
            .Format({ge::FORMAT_ND})
            .UnknownShapeFormat({ge::FORMAT_ND});

        this->SetInferShape(ge::InferShape);

        this->AICore()
            .SetTiling(optiling::TilingFunc);
        this->AICore().AddConfig("ascend310b");
    }
};

OP_ADD(SinhCustom);
}

sinh_custom_tiling.h


#ifndef SINH_CUSTOM_TILING_H
#define SINH_CUSTOM_TILING_H

#include "register/tilingdata_base.h"
/**
 * Tiling data exchanged between the host-side tiling function and the kernel.
 *
 * Fields:
 *   totalLength - total number of input elements.
 *   tileNum     - number of tiles.
 *
 * REGISTER_TILING_DATA_CLASS binds SinhCustomTilingData to the SinhCustom
 * operator. The include guard keeps the definition and the registration from
 * being emitted twice when this header is included more than once in a
 * translation unit.
 */
namespace optiling {
BEGIN_TILING_DATA_DEF(SinhCustomTilingData)
  TILING_DATA_FIELD_DEF(uint32_t, totalLength);
  TILING_DATA_FIELD_DEF(uint32_t, tileNum);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(SinhCustom, SinhCustomTilingData)
}

#endif  // SINH_CUSTOM_TILING_H

op_kernel端

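前两个文件（op_host端的 sinh_custom.cpp 与 sinh_custom_tiling.h）和Add算子的对应文件基本相同，关键需要修改的是op_kernel端的逻辑。sinh算子的公式为 sinh(x) = (exp(x) - exp(-x)) / 2.0，计算总共分为四个部分，分别是：对输入取负得到 -x（Muls）、分别计算 exp(-x) 和 exp(x)（Exp）、两者相减（Sub）、将差乘以0.5（Muls）。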

sinh_custom.cpp

#include "kernel_operator.h"
using namespace AscendC;
constexpr int32_t BUFFER_NUM = 2;

/**
 * Element-wise sinh kernel: y = (exp(x) - exp(-x)) * 0.5.
 *
 * Each block handles blockLength = totalLength / GetBlockNum() elements, split
 * into tileNum * BUFFER_NUM tiles of tileLength elements. Every tile goes
 * through CopyIn -> Compute -> CopyOut.
 *
 * NOTE: all lengths come from integer division, so elements that do not fit
 * into GetBlockNum() * tileNum * BUFFER_NUM equal tiles are not processed.
 * NOTE(review): Compute passes tensors of DTYPE_X and DTYPE_Y to the same
 * vector calls, so it presumably relies on both being the same type (the host
 * registration uses DT_FLOAT16 for both) - confirm before adding other dtypes.
 */
class KernelSinh {
public:
    __aicore__ inline KernelSinh() {}

    /**
     * Computes the per-block and per-tile lengths, points xGm / yGm at this
     * block's slice of the input and output, and reserves the queue buffers
     * plus two scratch buffers, each sized for one tile.
     *
     * @param x           Address of the input data.
     * @param y           Address of the output data.
     * @param totalLength Total number of elements across all blocks.
     * @param tileNum     Number of tiles per block (before the BUFFER_NUM split).
     */
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t totalLength, uint32_t tileNum)
    {
        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
        this->blockLength = totalLength / GetBlockNum();
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;

        // Offset both global tensors to the slice owned by the current block.
        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * GetBlockIdx(),
                            this->blockLength);
        yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * GetBlockIdx(),
                            this->blockLength);

        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y));
        // Two scratch tiles are enough: the output tile itself holds exp(x).
        pipe.InitBuffer(scratchBuf, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(negExpBuf, this->tileLength * sizeof(DTYPE_X));
    }

    /**
     * Main loop: for each of the tileNum * BUFFER_NUM tiles, copies the tile
     * in, computes sinh on it, and copies the result out.
     */
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    /** Copies tile `progress` of the input from xGm into a tensor from inQueueX. */
    __aicore__ inline void CopyIn(int32_t progress)
    {
        LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }

    /**
     * Computes sinh for one tile taken from inQueueX and enqueues the result
     * on outQueueY. `progress` is not used by the computation itself.
     */
    __aicore__ inline void Compute(int32_t progress)
    {
        LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        LocalTensor<DTYPE_Y> yLocal = outQueueY.AllocTensor<DTYPE_Y>();
        LocalTensor<DTYPE_X> scratch = scratchBuf.Get<DTYPE_X>();
        LocalTensor<DTYPE_X> negExp = negExpBuf.Get<DTYPE_X>();
        DTYPE_X negOne = -1;
        DTYPE_X oneHalf = 0.5;

        // sinh(x) = (exp(x) - exp(-x)) / 2.0
        Muls(scratch, xLocal, negOne, this->tileLength);   // scratch = -x
        Exp(negExp, scratch, this->tileLength);            // negExp  = exp(-x)
        Exp(yLocal, xLocal, this->tileLength);             // y       = exp(x)
        Sub(scratch, yLocal, negExp, this->tileLength);    // scratch = exp(x) - exp(-x)
        Muls(yLocal, scratch, oneHalf, this->tileLength);  // y       = scratch * 0.5

        outQueueY.EnQue<DTYPE_Y>(yLocal);
        inQueueX.FreeTensor(xLocal);
    }

    /** Copies the result tile taken from outQueueY to tile `progress` of yGm. */
    __aicore__ inline void CopyOut(int32_t progress)
    {
        LocalTensor<DTYPE_Y> yLocal = outQueueY.DeQue<DTYPE_Y>();
        DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength);
        outQueueY.FreeTensor(yLocal);
    }

private:
    TPipe pipe;
    // Input queue; its depth equals BUFFER_NUM.
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;
    // Output queue; its depth equals BUFFER_NUM.
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueY;
    // Element types follow DTYPE_X / DTYPE_Y so they match the casts in Init.
    GlobalTensor<DTYPE_X> xGm;
    GlobalTensor<DTYPE_Y> yGm;

    // Scratch tiles used by Compute.
    TBuf<QuePosition::VECCALC> scratchBuf, negExpBuf;
    uint32_t blockLength;  // elements handled by this block
    uint32_t tileNum;      // tiles per block before the BUFFER_NUM split
    uint32_t tileLength;   // elements per tile
};
/**
 * Kernel entry point for the custom sinh operator.
 *
 * Reads the tiling data from `tiling`, initializes a KernelSinh with the
 * input/output addresses plus totalLength and tileNum, and runs it.
 * `workspace` is not used in this function.
 */
extern "C" __global__ __aicore__ void sinh_custom(GM_ADDR x,
                                                  GM_ADDR y,
                                                  GM_ADDR workspace,
                                                  GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);

    KernelSinh sinhKernel;
    sinhKernel.Init(x, y, tilingData.totalLength, tilingData.tileNum);
    sinhKernel.Process();
}

2) 代码补齐完成后，cd /root/SinhCustom/SinhCustom，然后执行如下命令进行编译构建：
bash build.sh
当命令显示如下信息，证明构建成功

3)构建成功之后，

cd /root/SinhCustom/SinhCustom/build_out 
./custom_opp_ubuntu_aarch64.run

当命令行显示如下信息证明安装成功:

4)最后，

cd /root/SinhCustom/AclNNInvocation 
bash run.sh

当命令行显示如下信息，说明通过测试.

5)测试通过后，将上述代码打包在zip 包内，例如使用如下命令：

cd /root 
zip -r SinhCustom.zip SinhCustom

6)打包完成后，到MobaXTerm左侧的文件栏中找到压缩包，下载到PC本地，上传到考试页面，交卷即可。

【声明】本内容来自华为云开发者社区博主，不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源（华为云社区）、文章链接、文章作者等基本信息，否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容，欢迎发送邮件进行举报，并提供相关证据，一经查实，本社区将立刻删除涉嫌侵权内容，举报邮箱： cloudbbs@huaweicloud.com

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

Ascend C算子开发（中级）—— 编写Sinh算子

Ascend C算子开发（中级）—— 编写Sinh算子

准备工作

香橙派与PC连接

Add 算子调用体验

Sinh算子开发（Ascend C算子开发中级认证考试内容）

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

Ascend C算子开发（中级）—— 编写Sinh算子

Ascend C算子开发（中级）—— 编写Sinh算子

准备工作

香橙派与PC连接

Add 算子调用体验

Sinh算子开发（Ascend C算子开发中级认证考试内容）

全部回复

设置昵称

关于作者

目录

热门推荐查看更多

相关文章

加入云驻计划，成为创作者

相关产品