# Host侧代码组装详解

【免费下载链接】catlass
本项目是CANN的算子模板库，提供NPU上高性能矩阵乘及其相关融合类算子模板样例。
项目地址: https://gitcode.com/cann/catlass

## 概述

本文以BasicMatmul算子为例，详细讲解如何基于CATLASS模板库组装Host侧代码，实现完整的矩阵乘法运算。Host侧代码主要负责环境初始化、资源管理、数据传输和算子调用等功能。

## Host侧代码结构

Host侧代码通常包含以下几个核心部分：

- 环境初始化与资源申请
- 输入数据准备
- 设备内存申请与数据拷贝
- 算子参数配置与调用
- 结果数据拷贝与验证
- 资源释放

下面以BasicMatmul算子为例，详细讲解每个部分的实现。

## 完整示例代码

创建examples/basic_matmul文件夹和examples/basic_matmul/basic_matmul.cpp文件，以下为该cpp文件的编写示例。

### 头文件与配置

```cpp
// 引入必要的头文件
#include "catlass/gemm/kernel/basic_matmul.hpp"

#include "helper.hpp"
#include "golden.hpp"

#include "catlass/catlass.hpp"
#include "catlass/arch/arch.hpp"
#include "catlass/gemm/block/block_mmad.hpp"
#include "catlass/gemm/block/block_swizzle.hpp"
#include "catlass/gemm/dispatch_policy.hpp"
#include "catlass/gemm/gemm_type.hpp"
#include "catlass/layout/layout.hpp"
#include "catlass/status.hpp"
#include "catlass/gemm/device/device_gemm.hpp"

using namespace Catlass;

// 解析输入参数
struct Options {
    const std::string HELPER = "basic_matmul m n k [device_id]";

    GemmCoord problemShape{128, 128, 128};
    int32_t deviceId{0};

    Options() = default;

    int Parse(int argc, const char **argv)
    {
        enum ArgsIndex {
            M_INDEX = 1,
            N_INDEX,
            K_INDEX,
            DEVICE_ID_INDEX,
            ARGS_MAX
        };

        if (argc > ARGS_MAX || argc <= K_INDEX) {
            std::cerr << HELPER << std::endl;
            return -1;
        }

        problemShape.m() = std::atoi(argv[M_INDEX]);
        problemShape.n() = std::atoi(argv[N_INDEX]);
        problemShape.k() = std::atoi(argv[K_INDEX]);
        if (argc == ARGS_MAX) {
            deviceId = std::atoi(argv[DEVICE_ID_INDEX]);
        }
        return 0;
    }
};
```

### 核心实现

```cpp
static void Run(const Options &options)
{
    /* 第一步：流初始化与设备侧空间申请 */
    aclrtStream stream{nullptr};

    ACL_CHECK(aclInit(nullptr));
    ACL_CHECK(aclrtSetDevice(options.deviceId));
    ACL_CHECK(aclrtCreateStream(&stream));

    // 初始化matmul矩阵的shape参数
    uint32_t m = options.problemShape.m();
    uint32_t n = options.problemShape.n();
    uint32_t k = options.problemShape.k();

    // 矩阵A的元素数量为m*k，矩阵B的元素数量为k*n，矩阵C的元素数量为m*n
    size_t lenA = static_cast<size_t>(m) * k;
    size_t lenB = static_cast<size_t>(k) * n;
    size_t lenC = static_cast<size_t>(m) * n;

    // 根据矩阵元素数量和数据类型计算矩阵占用内存大小
    size_t sizeA = lenA * sizeof(fp16_t);
    size_t sizeB = lenB * sizeof(fp16_t);
    size_t sizeC = lenC * sizeof(fp16_t);

    // 初始化数据排布格式，RowMajor表示行优先
    using LayoutA = layout::RowMajor;
    using LayoutB = layout::RowMajor;
    using LayoutC = layout::RowMajor;
    LayoutA layoutA{m, k};
    LayoutB layoutB{k, n};
    LayoutC layoutC{m, n};

    // 初始化输入数据
    std::vector<fp16_t> hostA(lenA);
    std::vector<fp16_t> hostB(lenB);
    golden::FillRandomData<fp16_t>(hostA, -5.0f, 5.0f);
    golden::FillRandomData<fp16_t>(hostB, -5.0f, 5.0f);

    // 申请A矩阵在device上的内存，并将A矩阵拷贝至device
    uint8_t *deviceA{nullptr};
    ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceA), sizeA, ACL_MEM_MALLOC_HUGE_FIRST));
    ACL_CHECK(aclrtMemcpy(deviceA, sizeA, hostA.data(), sizeA, ACL_MEMCPY_HOST_TO_DEVICE));

    // 申请B矩阵在device上的内存，并将B矩阵拷贝至device
    uint8_t *deviceB{nullptr};
    ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceB), sizeB, ACL_MEM_MALLOC_HUGE_FIRST));
    ACL_CHECK(aclrtMemcpy(deviceB, sizeB, hostB.data(), sizeB, ACL_MEMCPY_HOST_TO_DEVICE));

    // 申请C矩阵在device上的内存
    uint8_t *deviceC{nullptr};
    ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceC), sizeC, ACL_MEM_MALLOC_HUGE_FIRST));

    // 获取当前硬件核心数量
    auto aicCoreNum = platform_ascendc::PlatformAscendCManager::GetInstance()->GetCoreNumAic();

    /* 第二步：选择优化策略 */
    using ArchTag = Arch::AtlasA2;
    using DispatchPolicy = Gemm::MmadAtlasA2Pingpong<true>;

    // 定义tiling切分策略
    using L1TileShape = GemmShape<128, 256, 256>;
    using L0TileShape = GemmShape<128, 256, 64>;

    /* 第三步：选择数据类型并组装模板样例组件 */
    using AType = Gemm::GemmType<half, LayoutA>;
    using BType = Gemm::GemmType<half, LayoutB>;
    using CType = Gemm::GemmType<half, LayoutC>;

    // 定义Block层进行矩阵乘计算的组件
    using BlockMmad = Gemm::Block::BlockMmad<DispatchPolicy, L1TileShape, L0TileShape, AType, BType, CType>;
    using BlockEpilogue = void;

    // 配置Block调度器，指定Block粒度的swizzle次序
    using BlockScheduler = typename Gemm::Block::GemmIdentityBlockSwizzle<3, 0>;

    // 指定kernel
    using MatmulKernel = Gemm::Kernel::BasicMatmul<BlockMmad, BlockEpilogue, BlockScheduler>;

    // 定义Device层适配器
    using MatmulAdapter = Gemm::Device::DeviceGemm<MatmulKernel>;
    MatmulKernel::Arguments arguments{options.problemShape, deviceA, deviceB, deviceC};

    /* 第四步：执行模板样例 */
    // 定义适配器对象
    MatmulAdapter matmulOp;
    // 判断kernel对相关参数可执行
    matmulOp.CanImplement(arguments);
    size_t sizeWorkspace = matmulOp.GetWorkspaceSize(arguments);
    uint8_t *deviceWorkspace = nullptr;
    if (sizeWorkspace > 0) {
        ACL_CHECK(
            aclrtMalloc(reinterpret_cast<void **>(&deviceWorkspace), sizeWorkspace, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // 初始化
    matmulOp.Initialize(arguments, deviceWorkspace);
    // 调用执行
    matmulOp(stream, aicCoreNum);
    ACL_CHECK(aclrtSynchronizeStream(stream));
    if (sizeWorkspace > 0) {
        ACL_CHECK(aclrtFree(deviceWorkspace));
    }

    // 将输出数据搬出
    std::vector<fp16_t> hostC(lenC);
    ACL_CHECK(aclrtMemcpy(hostC.data(), sizeC, deviceC, sizeC, ACL_MEMCPY_DEVICE_TO_HOST));

    // 计算精度标杆并与输出数据比对
    std::vector<float> hostGolden(lenC);
    golden::ComputeMatmul(hostGolden, hostA, hostB, m, n, k);
    auto diff = helper::CompareData(hostC, hostGolden);
    std::cout << "Compare " << (diff ? "failed" : "success") << std::endl;

    /* 第五步：释放资源 */
    ACL_CHECK(aclrtFree(deviceA));
    ACL_CHECK(aclrtFree(deviceB));
    ACL_CHECK(aclrtFree(deviceC));
    ACL_CHECK(aclrtDestroyStream(stream));
    ACL_CHECK(aclrtResetDevice(options.deviceId));
    ACL_CHECK(aclFinalize());
}

int main(int argc, const char **argv)
{
    Options options;
    if (options.Parse(argc, argv) != 0) {
        return -1;
    }
    try {
        Run(options);
        return 0;
    } catch (const std::exception &e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return -1;
    }
}
```

## 核心实现代码详解

### 1. 环境初始化与资源申请

这部分代码主要完成以下工作：

- 初始化ACL环境
- 设置设备ID
- 创建计算流
- 计算矩阵维度和内存大小
- 定义数据排布格式

```cpp
aclrtStream stream{nullptr};

ACL_CHECK(aclInit(nullptr));
ACL_CHECK(aclrtSetDevice(options.deviceId));
ACL_CHECK(aclrtCreateStream(&stream));

// 计算矩阵维度和内存大小
uint32_t m = options.problemShape.m();
uint32_t n = options.problemShape.n();
uint32_t k = options.problemShape.k();

size_t lenA = static_cast<size_t>(m) * k;
size_t lenB = static_cast<size_t>(k) * n;
size_t lenC = static_cast<size_t>(m) * n;

size_t sizeA = lenA * sizeof(fp16_t);
size_t sizeB = lenB * sizeof(fp16_t);
size_t sizeC = lenC * sizeof(fp16_t);

// 定义数据排布格式
using LayoutA = layout::RowMajor;
using LayoutB = layout::RowMajor;
using LayoutC = layout::RowMajor;
LayoutA layoutA{m, k};
LayoutB layoutB{k, n};
LayoutC layoutC{m, n};
```

### 2. 输入数据准备

这部分代码主要完成以下工作：

- 在Host侧创建输入数据缓冲区
- 填充随机数据

```cpp
std::vector<fp16_t> hostA(lenA);
std::vector<fp16_t> hostB(lenB);
golden::FillRandomData<fp16_t>(hostA, -5.0f, 5.0f);
golden::FillRandomData<fp16_t>(hostB, -5.0f, 5.0f);
```

### 3. 设备内存申请与数据拷贝

这部分代码主要完成以下工作：

- 在Device侧申请内存
- 将Host侧数据拷贝到Device侧

```cpp
// 申请A矩阵在device上的内存，并将A矩阵拷贝至device
uint8_t *deviceA{nullptr};
ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceA), sizeA, ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMemcpy(deviceA, sizeA, hostA.data(), sizeA, ACL_MEMCPY_HOST_TO_DEVICE));

// 申请B矩阵在device上的内存，并将B矩阵拷贝至device
uint8_t *deviceB{nullptr};
ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceB), sizeB, ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMemcpy(deviceB, sizeB, hostB.data(), sizeB, ACL_MEMCPY_HOST_TO_DEVICE));

// 申请C矩阵在device上的内存
uint8_t *deviceC{nullptr};
ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceC), sizeC, ACL_MEM_MALLOC_HUGE_FIRST));
```

### 4. 算子参数配置与调用

这部分代码主要完成以下工作：

- 选择架构和调度策略
- 定义Tile形状
- 组装Block层和Kernel层组件
- 初始化Device层适配器
- 执行算子

```cpp
// 选择调度策略
using ArchTag = Arch::AtlasA2;
using DispatchPolicy = Gemm::MmadAtlasA2Pingpong<true>;

// 定义tiling切分策略
using L1TileShape = GemmShape<128, 256, 256>;
using L0TileShape = GemmShape<128, 256, 64>;

// 选择数据类型并组装模板样例组件
using AType = Gemm::GemmType<half, LayoutA>;
using BType = Gemm::GemmType<half, LayoutB>;
using CType = Gemm::GemmType<half, LayoutC>;

// 定义Block层进行矩阵乘计算的组件
using BlockMmad = Gemm::Block::BlockMmad<DispatchPolicy, L1TileShape, L0TileShape, AType, BType, CType>;
using BlockEpilogue = void;

// 配置Block调度器
using BlockScheduler = typename Gemm::Block::GemmIdentityBlockSwizzle<3, 0>;

// 指定kernel
using MatmulKernel = Gemm::Kernel::BasicMatmul<BlockMmad, BlockEpilogue, BlockScheduler>;

// 定义Device层适配器
using MatmulAdapter = Gemm::Device::DeviceGemm<MatmulKernel>;
MatmulKernel::Arguments arguments{options.problemShape, deviceA, deviceB, deviceC};

// 执行模板样例
MatmulAdapter matmulOp;
matmulOp.CanImplement(arguments);
size_t sizeWorkspace = matmulOp.GetWorkspaceSize(arguments);
uint8_t *deviceWorkspace = nullptr;
if (sizeWorkspace > 0) {
    ACL_CHECK(aclrtMalloc(reinterpret_cast<void **>(&deviceWorkspace), sizeWorkspace, ACL_MEM_MALLOC_HUGE_FIRST));
}
matmulOp.Initialize(arguments, deviceWorkspace);
matmulOp(stream, aicCoreNum);
ACL_CHECK(aclrtSynchronizeStream(stream));
```

### 5. 结果数据拷贝与验证

这部分代码主要完成以下工作：

- 将Device侧计算结果拷贝到Host侧
- 计算标杆结果并与结果进行比较

```cpp
// 将输出数据搬出
std::vector<fp16_t> hostC(lenC);
ACL_CHECK(aclrtMemcpy(hostC.data(), sizeC, deviceC, sizeC, ACL_MEMCPY_DEVICE_TO_HOST));

// 计算精度标杆并与输出数据比对
std::vector<float> hostGolden(lenC);
golden::ComputeMatmul(hostGolden, hostA, hostB, m, n, k);
auto diff = helper::CompareData(hostC, hostGolden);
std::cout << "Compare " << (diff ? "failed" : "success") << std::endl;
```

### 6. 资源释放

这部分代码主要完成以下工作：

- 释放Device侧内存
- 销毁计算流
- 重置设备
- 释放ACL环境

```cpp
// 释放资源
ACL_CHECK(aclrtFree(deviceA));
ACL_CHECK(aclrtFree(deviceB));
ACL_CHECK(aclrtFree(deviceC));
if (sizeWorkspace > 0) {
    ACL_CHECK(aclrtFree(deviceWorkspace));
}
ACL_CHECK(aclrtDestroyStream(stream));
ACL_CHECK(aclrtResetDevice(options.deviceId));
ACL_CHECK(aclFinalize());
```

## 编译与运行

### 编译

创建examples/basic_matmul/CMakeLists.txt文件：

```cmake
# CMakeLists.txt
set_source_files_properties(basic_matmul.cpp PROPERTIES LANGUAGE ASC)
catlass_example_add_executable(
    basic_matmul
    cube
    basic_matmul.cpp
)
```

在examples/CMakeLists.txt中将新增示例加入编译清单：

```cmake
set(EXAMPLE_ATLASA2
    00_basic_matmul
    # ...
    basic_matmul
)
```

执行编译命令：

```bash
bash scripts/build.sh basic_matmul
```

### 运行

```bash
cd output/bin
./basic_matmul 256 512 1024 0
```

【免费下载链接】catlass
本项目是CANN的算子模板库，提供NPU上高性能矩阵乘及其相关融合类算子模板样例。
项目地址: https://gitcode.com/cann/catlass

创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考