0基础体验基于CANN ACL实现Add单算子调用应用步骤记录

举报
JeffDing 发表于 2021/12/27 11:15:07 2021/12/27
【摘要】 一、下载样例代码:wget https://obs-book.obs.cn-east-2.myhuaweicloud.com/aclopsamples/aclop_samples.tar.gz;tar -zxvf aclop_samples.tar.gz。二、安装TF1.15:pip3 install tensorflow==1.15.0 --user -i https://pypi.tuna.tsing...

一、下载样例代码

wget https://obs-book.obs.cn-east-2.myhuaweicloud.com/aclopsamples/aclop_samples.tar.gz
tar -zxvf aclop_samples.tar.gz

二、安装TF1.15

pip3 install tensorflow==1.15.0 --user -i https://pypi.tuna.tsinghua.edu.cn/simple

三、查看昇腾310算子清单

算子清单:https://support.huaweicloud.com/oplist-cann504alpha3infer/atlas_11_operatorlist_0001.html

如果在windows上打开显示空白的话,需要右键chm文件然后找到【解锁】按钮,点一下,再打开就可以看到内容了

四、开始单算子调用应用代码修改

复制样例代码

cp -r conv2d_aclopExecuteV2 sub_aclopExecuteV2

实现算子列表中的Add算子

清空model目录

rm -rf model/*

修改op_list.json

[
  {
    "op": "Add",
    "input_desc": [
      {
        "format": "NHWC",
        "shape": [1, 1, 1, 1],
        "type": "float16"
      },
      {
        "format": "NHWC",
        "shape": [1, 1, 1, 1],
        "type": "float16"
      }
    ],
    "output_desc": [
      {
        "format": "NHWC",
        "shape": [1, 1, 1, 1],
        "type": "float16"
      }
    ],
    "attr": [
    ]
  }
]

转换

atc --singleop=./op_list.json --output=./ --soc_version=Ascend310

修改src/main.cpp

// int main()
#include "acl/acl.h"
#include "utils.h"
#include "acl/ops/acl_cblas.h"
#include "acl/acl_op_compiler.h"

using namespace std;

/**
 * Copies an operator result into a temporary host buffer and dumps the raw
 * bytes to a binary file.
 *
 * Despite the name nothing is printed to the console: the bytes are written
 * unmodified to out_file.
 *
 * @param out_buffers      Source address handed to aclrtMemcpy with
 *                         ACL_MEMCPY_DEVICE_TO_HOST.
 * @param out_tensor_size  Number of bytes to copy and to write.
 * @param out_file         Path of the file to create or truncate.
 *
 * Failures are logged through ERROR_LOG and the function returns early; the
 * host buffer is released on every path.
 */
void PrintResult(void * out_buffers,uint32_t out_tensor_size, std::string out_file){
    void* hostBuffer = nullptr;
    aclError ret = aclrtMallocHost(&hostBuffer, out_tensor_size);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("fail to print result, malloc host failed");
        // Without a host buffer there is nothing to copy into.
        return;
    }

    ret = aclrtMemcpy(hostBuffer, out_tensor_size, out_buffers, out_tensor_size, ACL_MEMCPY_DEVICE_TO_HOST);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("fail to print result, memcpy device to host failed, errorCode is %d", static_cast<int32_t>(ret));
        aclrtFreeHost(hostBuffer);
        // The buffer was just released; it must not be written to the file.
        return;
    }

    {
        // Scoped so the stream is flushed and closed before the buffer is freed.
        std::ofstream outstr(out_file, std::ios::out | std::ios::binary);
        if (!outstr) {
            ERROR_LOG("fail to print result, open file %s failed", out_file.c_str());
        } else {
            outstr.write(static_cast<const char*>(hostBuffer), out_tensor_size);
        }
    }

    aclrtFreeHost(hostBuffer);
}


int main(int argc, char* argv[])
{ 

    for (int i = 0; i < argc; i++) 
	{
		cout << "第" << i << "个参数是:" << argv[i] << endl;
	}

    // 输入,后期需要修改为各种算子都可以输入的框架
    std::string x_format = "NHWC";
    std::string y_format = "NHWC";

    std::string input_x_file = argv[1];
    std::string input_y_file = argv[2];
    std::string out_file = argv[3];
    //std::string out_file ;

    std::vector<int64_t> inputShapeCast{2, 1024, 1024, 3};
    std::vector<int64_t> inputFilterShapeCast{6, 3, 3, 3};
    std::vector<int64_t> outputShapeCast{2, 1024, 1024, 6};

    // single op call
    const char* opType_ = "Conv2D";
    // int numInput = 3;
    int numInput = 2;
    int numOutput = 1;

    aclDataType inputDataTypeCast = ACL_FLOAT16;
    aclDataType outputDataTypeCast = ACL_FLOAT16;


    // ACL init
    const char *aclConfigPath = "../src/acl.json";
    aclError ret = aclInit(aclConfigPath);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("acl init failed, errorCode = %d", static_cast<int32_t>(ret));
        return FAILED;
    }
    INFO_LOG("acl init success");

    int32_t deviceId_ = 0;
    aclrtContext context_;
    aclrtStream stream_;

    // set device
    ret = aclrtSetDevice(deviceId_);
    ret = aclrtCreateContext(&context_, deviceId_);
    ret = aclrtCreateStream(&stream_);  

    // set model dir 
    ret = aclopSetModelDir("../model");

    aclTensorDesc *inputDescCast[numInput];
    aclTensorDesc *OutputDescCast[numOutput];

    // Create aclTensorDesc, to describe the shape/format/datatype, etc.
    inputDescCast[0] = aclCreateTensorDesc(inputDataTypeCast, inputShapeCast.size(), inputShapeCast.data(), ACL_FORMAT_NHWC);
    inputDescCast[1] = aclCreateTensorDesc(inputDataTypeCast, inputFilterShapeCast.size(), inputFilterShapeCast.data(), ACL_FORMAT_NCHW);
    // inputDescCast[2] = aclCreateTensorDesc(ACL_DT_UNDEFINED, 0, nullptr, ACL_FORMAT_UNDEFINED);
    OutputDescCast[0] = aclCreateTensorDesc(outputDataTypeCast, outputShapeCast.size(), outputShapeCast.data(), ACL_FORMAT_NHWC);
 
    // set Conv2D attr
    aclopAttr *opAttr = aclopCreateAttr();
    if (opAttr == nullptr) {
        ERROR_LOG("singleOp create attr failed");
        return FAILED;
    }
    int64_t intList[4]{1, 1, 1, 1};

    /*
    ret = aclopSetAttrListInt(opAttr, "strides", 4, intList);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("singleOp set strides attr failed");
        aclopDestroyAttr(opAttr);
        return FAILED;
    }
    
    ret = aclopSetAttrListInt(opAttr, "pads", 4, intList);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("singleOp set pads attr failed");
        aclopDestroyAttr(opAttr);
        return FAILED;
    }

    ret = aclopSetAttrListInt(opAttr, "dilations", 4, intList);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("singleOp set dilations attr failed");
        aclopDestroyAttr(opAttr);
        return FAILED;
    }   
*/

    void* x_tensor_ptr = nullptr;
    void* y_tensor_ptr = nullptr;
    void* out_tensor_ptr = nullptr;
    uint32_t x_tensor_size;
    uint32_t y_tensor_size;
    uint32_t out_tensor_size = sizeof(x_tensor_size);
    std::vector<aclDataBuffer*> in_buffers;
    std::vector<aclDataBuffer*> out_buffers;

    // 分别从文件中加载数据后,数据已经在device中了
    x_tensor_ptr = Utils::GetDeviceBufferOfFile(input_x_file, x_tensor_size);
    y_tensor_ptr = Utils::GetDeviceBufferOfFile(input_y_file, y_tensor_size);
    //out_tensor_ptr = Utils::GetDeviceBufferOfFile(out_file, out_tensor_size);

    ret = aclrtMalloc(&out_tensor_ptr, out_tensor_size, ACL_MEM_MALLOC_HUGE_FIRST);

    // 包装成databuffer
    aclDataBuffer* x_tensor_data = aclCreateDataBuffer(x_tensor_ptr, x_tensor_size);


    if (x_tensor_data == nullptr) {
        ERROR_LOG("can't create data buffer, create input failed");
        return FAILED;
    }

    // 包装成databuffer
    aclDataBuffer* y_tensor_data = aclCreateDataBuffer(y_tensor_ptr, y_tensor_size);
    if (y_tensor_data == nullptr) {
        ERROR_LOG("can't create data buffer, create input failed");
        return FAILED;
    }

    // 包装成databuffer
    aclDataBuffer* bias_tensor_data = aclCreateDataBuffer(nullptr, 0);

    // 包装成databuffer
    aclDataBuffer* out_tensor_data = aclCreateDataBuffer(out_tensor_ptr, out_tensor_size);
    if (out_tensor_data == nullptr) {
        ERROR_LOG("can't create data buffer, create input failed");
        return FAILED;
    }

    in_buffers.push_back(x_tensor_data);
    in_buffers.push_back(y_tensor_data);
    in_buffers.push_back(bias_tensor_data);
    out_buffers.push_back(out_tensor_data);


                            
    for(int i=0; i < 1; i++)
    {
        // ret = aclopCompileAndExecute(opType_, numInput, inputDescCast, 
        // in_buffers.data(), numOutput, OutputDescCast, out_buffers.data(),
        // opAttr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, stream_);
        ret = aclopExecuteV2(opType_, numInput, inputDescCast, 
        in_buffers.data(), numOutput, OutputDescCast, out_buffers.data(),
        opAttr, stream_);
        ret = aclrtSynchronizeStream(stream_);
        if (ret != ACL_ERROR_NONE) {
            ERROR_LOG("execute singleOp conv2d failed, errorCode is %d", static_cast<int32_t>(ret));
            aclDestroyTensorDesc(inputDescCast[0]);
            aclDestroyTensorDesc(OutputDescCast[0]);
            return FAILED;
        }

        INFO_LOG("execute conv2d %d", i);
        PrintResult(out_tensor_ptr, out_tensor_size, out_file);

    }    

    INFO_LOG("execute op success");

    if (stream_ != nullptr) {
        ret = aclrtDestroyStream(stream_);
        if (ret != ACL_ERROR_NONE) {
            ERROR_LOG("destroy stream failed, errorCode = %d", static_cast<int32_t>(ret));
        }
        stream_ = nullptr;
    }
    INFO_LOG("end to destroy stream");

    if (context_ != nullptr) {
        ret = aclrtDestroyContext(context_);
        if (ret != ACL_ERROR_NONE) {
            ERROR_LOG("destroy context failed, errorCode = %d", static_cast<int32_t>(ret));
        }
        context_ = nullptr;
    }
    INFO_LOG("end to destroy context");

    ret = aclrtResetDevice(deviceId_);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("reset device %d failed, errorCode = %d", deviceId_, static_cast<int32_t>(ret));
    }
    INFO_LOG("end to reset device ");

    ret = aclFinalize();
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("finalize acl failed, errorCode = %d", static_cast<int32_t>(ret));
    }
    INFO_LOG("end to finalize acl");

    return SUCCESS;
}

 使用cmake编译,修改computertf.py

import tensorflow as tf
import numpy as np
import sys, getopt,time

def compute(input_x_file, input_y_file, out_file):
    """Add two float16 input files with TensorFlow and compare against the NPU result.

    Args:
        input_x_file: Path of a raw binary file read as float16 (first operand).
        input_y_file: Path of a raw binary file read as float16 (second operand).
        out_file: Path of the raw float16 file holding the NPU result.

    Prints the elapsed time of the TensorFlow run, the TensorFlow result, and
    either the largest absolute difference to the NPU result or a size-mismatch
    message. Returns None.
    """
    a = np.fromfile(input_x_file, dtype=np.float16)
    b = np.fromfile(input_y_file, dtype=np.float16)

    # Feed the data read from the files; they are the same inputs the NPU run consumed.
    x = tf.Variable(a)
    y = tf.Variable(b)

    op = tf.math.add(x, y, name=None)

    # The context manager closes the session once the reference result is computed.
    with tf.Session() as sess:
        tf.global_variables_initializer().run(session=sess)
        start = time.time()
        # op_output is the TensorFlow-side result of the Add operator.
        op_output = sess.run(op)
        end = time.time()
    print("cpu_time: ", end - start)
    print(op_output)

    # Load the NPU result file with numpy.
    c = np.fromfile(out_file, dtype=np.float16)

    # Report the difference; a value of 0 means both results agree exactly.
    reference = np.asarray(op_output).reshape(-1)
    if c.size != reference.size:
        print("size mismatch: npu result has %d elements, tf result has %d" % (c.size, reference.size))
    elif c.size == 0:
        print("max abs diff: ", 0.0)
    else:
        # Subtract in float32 so the comparison itself does not overflow float16.
        diff = c.astype(np.float32) - reference.astype(np.float32)
        print("max abs diff: ", float(np.abs(diff).max()))



if __name__ == '__main__':
    # Three paths are required: input x, input y and the NPU output file.
    if len(sys.argv) < 4:
        print("paras is not ok")
        # Exit with a non-zero status so a calling script can detect the usage error.
        sys.exit(1)

    compute(sys.argv[1], sys.argv[2], sys.argv[3])

执行test.sh,出现以下结果:

【版权声明】本文为华为云社区用户原创内容,转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息, 否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。