0基础体验基于CANN ACL实现Add单算子调用应用步骤记录
【摘要】 一、下载样例代码:wget https://obs-book.obs.cn-east-2.myhuaweicloud.com/aclopsamples/aclop_samples.tar.gz ;tar -zxvf aclop_samples.tar.gz 二、安装TF1.15:pip3 install tensorflow==1.15.0 --user -i https://pypi.tuna.tsing...
一、下载样例代码
wget https://obs-book.obs.cn-east-2.myhuaweicloud.com/aclopsamples/aclop_samples.tar.gz
tar -zxvf aclop_samples.tar.gz
二、安装TF1.15
pip3 install tensorflow==1.15.0 --user -i https://pypi.tuna.tsinghua.edu.cn/simple
三、查看昇腾310算子清单
算子清单:https://support.huaweicloud.com/oplist-cann504alpha3infer/atlas_11_operatorlist_0001.html
如果在windows上打开显示空白的话,需要右键chm文件然后找到【解锁】按钮,点一下,再打开就可以看到内容了
四、开始单算子调用应用代码修改
复制样例代码
cp -r conv2d_aclopExecuteV2 sub_aclopExecuteV2
实现算子列表中的Add算子
清空model目录
rm -rf model/*
修改op_list.json
[
{
"op": "Add",
"input_desc": [
{
"format": "NHWC",
"shape": [1, 1, 1, 1],
"type": "float16"
},
{
"format": "NHWC",
"shape": [1, 1, 1, 1],
"type": "float16"
}
],
"output_desc": [
{
"format": "NHWC",
"shape": [1, 1, 1, 1],
"type": "float16"
}
],
"attr": [
]
}
]
转换
atc --singleop=./op_list.json --output=./ --soc_version=Ascend310
修改src/main.cpp
// int main()
#include "acl/acl.h"
#include "utils.h"
#include "acl/ops/acl_cblas.h"
#include "acl/acl_op_compiler.h"
using namespace std;
/**
 * Copies an operator result from device memory into a temporary host buffer
 * and dumps the raw bytes to a binary file.
 *
 * Failures are logged with ERROR_LOG and the function returns without writing;
 * the temporary host buffer is released on every path.
 *
 * @param out_buffers      Device pointer holding the result to dump.
 * @param out_tensor_size  Number of bytes to copy and write.
 * @param out_file         Path of the file that receives the raw bytes.
 */
void PrintResult(void * out_buffers,uint32_t out_tensor_size, std::string out_file){
    void* hostBuffer = nullptr;
    aclError ret = aclrtMallocHost(&hostBuffer, out_tensor_size);
    if (ret != ACL_ERROR_NONE || hostBuffer == nullptr) {
        // Without a host buffer there is nothing to copy into; stop here
        // instead of handing a null destination to aclrtMemcpy.
        ERROR_LOG("fail to print result, malloc host failed");
        return;
    }

    ret = aclrtMemcpy(hostBuffer, out_tensor_size, out_buffers, out_tensor_size, ACL_MEMCPY_DEVICE_TO_HOST);
    if (ret != ACL_ERROR_NONE) {
        ERROR_LOG("fail to print result, memcpy device to host failed, errorCode is %d", static_cast<int32_t>(ret));
        (void)aclrtFreeHost(hostBuffer);
        // The buffer is gone and its content was never valid: do not write it.
        return;
    }

    std::ofstream outstr(out_file, std::ios::out | std::ios::binary);
    if (!outstr) {
        ERROR_LOG("fail to print result, open output file %s failed", out_file.c_str());
    } else {
        outstr.write(static_cast<const char*>(hostBuffer), static_cast<std::streamsize>(out_tensor_size));
        outstr.close();
    }

    // Release the staging buffer on the success path as well.
    (void)aclrtFreeHost(hostBuffer);
}
int main(int argc, char* argv[])
{
for (int i = 0; i < argc; i++)
{
cout << "第" << i << "个参数是:" << argv[i] << endl;
}
// 输入,后期需要修改为各种算子都可以输入的框架
std::string x_format = "NHWC";
std::string y_format = "NHWC";
std::string input_x_file = argv[1];
std::string input_y_file = argv[2];
std::string out_file = argv[3];
//std::string out_file ;
std::vector<int64_t> inputShapeCast{2, 1024, 1024, 3};
std::vector<int64_t> inputFilterShapeCast{6, 3, 3, 3};
std::vector<int64_t> outputShapeCast{2, 1024, 1024, 6};
// single op call
const char* opType_ = "Conv2D";
// int numInput = 3;
int numInput = 2;
int numOutput = 1;
aclDataType inputDataTypeCast = ACL_FLOAT16;
aclDataType outputDataTypeCast = ACL_FLOAT16;
// ACL init
const char *aclConfigPath = "../src/acl.json";
aclError ret = aclInit(aclConfigPath);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("acl init failed, errorCode = %d", static_cast<int32_t>(ret));
return FAILED;
}
INFO_LOG("acl init success");
int32_t deviceId_ = 0;
aclrtContext context_;
aclrtStream stream_;
// set device
ret = aclrtSetDevice(deviceId_);
ret = aclrtCreateContext(&context_, deviceId_);
ret = aclrtCreateStream(&stream_);
// set model dir
ret = aclopSetModelDir("../model");
aclTensorDesc *inputDescCast[numInput];
aclTensorDesc *OutputDescCast[numOutput];
// Create aclTensorDesc, to describe the shape/format/datatype, etc.
inputDescCast[0] = aclCreateTensorDesc(inputDataTypeCast, inputShapeCast.size(), inputShapeCast.data(), ACL_FORMAT_NHWC);
inputDescCast[1] = aclCreateTensorDesc(inputDataTypeCast, inputFilterShapeCast.size(), inputFilterShapeCast.data(), ACL_FORMAT_NCHW);
// inputDescCast[2] = aclCreateTensorDesc(ACL_DT_UNDEFINED, 0, nullptr, ACL_FORMAT_UNDEFINED);
OutputDescCast[0] = aclCreateTensorDesc(outputDataTypeCast, outputShapeCast.size(), outputShapeCast.data(), ACL_FORMAT_NHWC);
// set Conv2D attr
aclopAttr *opAttr = aclopCreateAttr();
if (opAttr == nullptr) {
ERROR_LOG("singleOp create attr failed");
return FAILED;
}
int64_t intList[4]{1, 1, 1, 1};
/*
ret = aclopSetAttrListInt(opAttr, "strides", 4, intList);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("singleOp set strides attr failed");
aclopDestroyAttr(opAttr);
return FAILED;
}
ret = aclopSetAttrListInt(opAttr, "pads", 4, intList);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("singleOp set pads attr failed");
aclopDestroyAttr(opAttr);
return FAILED;
}
ret = aclopSetAttrListInt(opAttr, "dilations", 4, intList);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("singleOp set dilations attr failed");
aclopDestroyAttr(opAttr);
return FAILED;
}
*/
void* x_tensor_ptr = nullptr;
void* y_tensor_ptr = nullptr;
void* out_tensor_ptr = nullptr;
uint32_t x_tensor_size;
uint32_t y_tensor_size;
uint32_t out_tensor_size = sizeof(x_tensor_size);
std::vector<aclDataBuffer*> in_buffers;
std::vector<aclDataBuffer*> out_buffers;
// 分别从文件中加载数据后,数据已经在device中了
x_tensor_ptr = Utils::GetDeviceBufferOfFile(input_x_file, x_tensor_size);
y_tensor_ptr = Utils::GetDeviceBufferOfFile(input_y_file, y_tensor_size);
//out_tensor_ptr = Utils::GetDeviceBufferOfFile(out_file, out_tensor_size);
ret = aclrtMalloc(&out_tensor_ptr, out_tensor_size, ACL_MEM_MALLOC_HUGE_FIRST);
// 包装成databuffer
aclDataBuffer* x_tensor_data = aclCreateDataBuffer(x_tensor_ptr, x_tensor_size);
if (x_tensor_data == nullptr) {
ERROR_LOG("can't create data buffer, create input failed");
return FAILED;
}
// 包装成databuffer
aclDataBuffer* y_tensor_data = aclCreateDataBuffer(y_tensor_ptr, y_tensor_size);
if (y_tensor_data == nullptr) {
ERROR_LOG("can't create data buffer, create input failed");
return FAILED;
}
// 包装成databuffer
aclDataBuffer* bias_tensor_data = aclCreateDataBuffer(nullptr, 0);
// 包装成databuffer
aclDataBuffer* out_tensor_data = aclCreateDataBuffer(out_tensor_ptr, out_tensor_size);
if (out_tensor_data == nullptr) {
ERROR_LOG("can't create data buffer, create input failed");
return FAILED;
}
in_buffers.push_back(x_tensor_data);
in_buffers.push_back(y_tensor_data);
in_buffers.push_back(bias_tensor_data);
out_buffers.push_back(out_tensor_data);
for(int i=0; i < 1; i++)
{
// ret = aclopCompileAndExecute(opType_, numInput, inputDescCast,
// in_buffers.data(), numOutput, OutputDescCast, out_buffers.data(),
// opAttr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, stream_);
ret = aclopExecuteV2(opType_, numInput, inputDescCast,
in_buffers.data(), numOutput, OutputDescCast, out_buffers.data(),
opAttr, stream_);
ret = aclrtSynchronizeStream(stream_);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("execute singleOp conv2d failed, errorCode is %d", static_cast<int32_t>(ret));
aclDestroyTensorDesc(inputDescCast[0]);
aclDestroyTensorDesc(OutputDescCast[0]);
return FAILED;
}
INFO_LOG("execute conv2d %d", i);
PrintResult(out_tensor_ptr, out_tensor_size, out_file);
}
INFO_LOG("execute op success");
if (stream_ != nullptr) {
ret = aclrtDestroyStream(stream_);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("destroy stream failed, errorCode = %d", static_cast<int32_t>(ret));
}
stream_ = nullptr;
}
INFO_LOG("end to destroy stream");
if (context_ != nullptr) {
ret = aclrtDestroyContext(context_);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("destroy context failed, errorCode = %d", static_cast<int32_t>(ret));
}
context_ = nullptr;
}
INFO_LOG("end to destroy context");
ret = aclrtResetDevice(deviceId_);
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("reset device %d failed, errorCode = %d", deviceId_, static_cast<int32_t>(ret));
}
INFO_LOG("end to reset device ");
ret = aclFinalize();
if (ret != ACL_ERROR_NONE) {
ERROR_LOG("finalize acl failed, errorCode = %d", static_cast<int32_t>(ret));
}
INFO_LOG("end to finalize acl");
return SUCCESS;
}
使用cmake编译,修改computertf.py
import tensorflow as tf
import numpy as np
import sys, getopt,time
def compute(input_x_file, input_y_file, out_file):
    """Compute x + y with TensorFlow on the CPU and compare it with the NPU result.

    Args:
        input_x_file: path of the raw float16 file holding the first operand.
        input_y_file: path of the raw float16 file holding the second operand.
        out_file: path of the raw float16 result file written by the NPU program.

    Prints the CPU execution time, the CPU result and the element-wise
    difference against the NPU result (all zeros means the results agree).
    Returns None.
    """
    # Use the very same operands the NPU program consumed; hard-coded values
    # here would make the comparison below meaningless.
    a = np.fromfile(input_x_file, dtype=np.float16)
    b = np.fromfile(input_y_file, dtype=np.float16)
    x = tf.Variable(a)
    y = tf.Variable(b)
    op = tf.math.add(x, y, name=None)

    # The context manager closes the session once the graph has run.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        start = time.time()
        # op_output is the result of the Add executed by TensorFlow on the CPU
        op_output = sess.run(op)
        end = time.time()
    print("cpu_time: ", end - start)
    print(op_output)

    # Read the NPU result file with numpy
    c = np.fromfile(out_file, dtype=np.float16)

    # Print the difference; if it is all zeros the NPU result is OK
    if c.shape != op_output.shape:
        print("shape mismatch, npu:", c.shape, "cpu:", op_output.shape)
        return
    # Widen to float32 so the subtraction itself does not lose precision.
    diff = c.astype(np.float32) - op_output.astype(np.float32)
    print(diff)
    print("max abs diff: ", np.max(np.abs(diff)) if diff.size else 0.0)
if __name__ == '__main__':
    # Expected invocation: <script> <input_x_file> <input_y_file> <out_file>
    if len(sys.argv) < 4:
        print("paras is not ok")
        # Exit with a non-zero status so a calling script can detect the failure.
        sys.exit(1)
    compute(sys.argv[1], sys.argv[2], sys.argv[3])
执行test.sh,出现以下结果:
【版权声明】本文为华为云社区用户原创内容,转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息, 否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)