From 525c3606a9c9e3c0dcc58f285a08830fd7c52f80 Mon Sep 17 00:00:00 2001 From: wly451717 Date: Tue, 9 Sep 2025 15:49:18 +0800 Subject: [PATCH 001/116] add atvos --- best_practices/atvos/atvos.h | 87 +++ best_practices/atvos/block/block_elemwise.h | 50 ++ .../atvos/block/block_elemwise_impl.h | 22 + best_practices/atvos/common/dagsch.h | 28 + best_practices/atvos/common/expression.h | 121 +++ best_practices/atvos/common/impl/atvos_impl.h | 152 ++++ best_practices/atvos/common/layout.h | 116 +++ best_practices/atvos/common/placeholder.h | 323 ++++++++ best_practices/atvos/device/device_vector.h | 58 ++ .../atvos/device/device_vector_impl.h | 31 + best_practices/atvos/dfx/check.h | 25 + best_practices/atvos/dfx/kernel_mirror.h | 32 + best_practices/atvos/kernel/kernel_elemwise.h | 35 + .../atvos/kernel/kernel_elemwise_impl.h | 24 + best_practices/atvos/tile/tile_alu.h | 730 ++++++++++++++++++ 15 files changed, 1834 insertions(+) create mode 100644 best_practices/atvos/atvos.h create mode 100644 best_practices/atvos/block/block_elemwise.h create mode 100644 best_practices/atvos/block/block_elemwise_impl.h create mode 100644 best_practices/atvos/common/dagsch.h create mode 100644 best_practices/atvos/common/expression.h create mode 100644 best_practices/atvos/common/impl/atvos_impl.h create mode 100644 best_practices/atvos/common/layout.h create mode 100644 best_practices/atvos/common/placeholder.h create mode 100644 best_practices/atvos/device/device_vector.h create mode 100644 best_practices/atvos/device/device_vector_impl.h create mode 100644 best_practices/atvos/dfx/check.h create mode 100644 best_practices/atvos/dfx/kernel_mirror.h create mode 100644 best_practices/atvos/kernel/kernel_elemwise.h create mode 100644 best_practices/atvos/kernel/kernel_elemwise_impl.h create mode 100644 best_practices/atvos/tile/tile_alu.h diff --git a/best_practices/atvos/atvos.h b/best_practices/atvos/atvos.h new file mode 100644 index 000000000..f39122886 --- /dev/null +++ b/best_practices/atvos/atvos.h @@ -0,0 +1,87 @@ +#ifndef _ATVC_H_TEMP_H_ +#define _ATVC_H_TEMP_H_ + +#include "common/impl/atvos_impl.h" + +namespace ATVOS { + +#pragma region "参数类型" +template +using Input = __auxc::Param<__auxc::ParamType::INPUT, DataType, Layout, Ext...>; + +template +using Output = __auxc::Param<__auxc::ParamType::OUTPUT, DataType, Layout, Ext...>; + +#pragma endregion + +#pragma region "layout" +template +struct Layout { + Shape shape; +}; + +template +using Shape = std::tuple; + +template +Shape MakeShape(Args... ts) { + return {ts...}; +} + +template +auto MakeLayout(Shape&& shape, Stride&& stride) { + return Layout{ shape}; +} +template +auto MakeLayout(Shape&& shape) { + return Layout{ shape}; +} + +template +auto MakeTensor(void* gmAddr, Shape&& layout) { + return Tensor>{gmAddr, layout}; +} + +#pragma endregion "layout" + + +template +struct ArgumentsBase { + using LayoutArgs = typename __auxc::TraitsToTensor::Type; +private: + template + static void SetArgs(LayoutArgs& obj, Argc const & argc, Args const& ... args) + { + constexpr static auto size = std::tuple_size_v< std::remove_reference_t< LayoutArgs > >; + + static_assert(offset= sizeof(LayoutArgs)"); + + //warning: 这个代码要修改,因为常量时 layout 与 传递的layout 的结构是不一样的 + std::get(obj) = argc; + + if constexpr ( offset < size - 1 && sizeof...(args) > 0 ) { + SetArgs(obj, args...); + } + } +public: + template + ArgumentsBase(Args const &... args) { + SetArgs<0>(this->param, args...); + } + ArgumentsBase(){}; +public: + LayoutArgs param; +}; + + +} //ATVOS + +#include "dfx/kernel_mirror.h" +#include "dfx/check.h" + +#include "common/placeholder.h" +#include "common/expression.h" +#include "common/dagsch.h" + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/block/block_elemwise.h b/best_practices/atvos/block/block_elemwise.h new file mode 100644 index 000000000..ff45114fb --- /dev/null +++ b/best_practices/atvos/block/block_elemwise.h @@ -0,0 +1,50 @@ +#ifndef _BlockElemwise_H +#define _BlockElemwise_H + +namespace ATVOS { + +template +class BlockElemwise { +public: + using OpTraits = Traits; + using Arguments= ArgumentsBase; + using Policy = Policy_; +public: + BlockElemwise(){} + + bool CanImplement(Arguments const& arg){ return true;}; //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg){ return 0; }; + int64_t Run(Arguments const& arg, Policy* policy){ return 0; }; +}; + +template +class BlockElemwise { +public: + using ComputeOp = DagOp; //DagSchedule; + using OpTraits = Traits; + using Arguments= ArgumentsBase; + using Policy = int; + +public: + static_assert( !(Dfx::IsVoid /* && Dfx::IsVoid */ ), "OpTraits_ is not void"); + +public: + //using OpTraits = std::conditional_t, typename ComputeOp::OpTraits, OpTraits_>; + //using Arguments= std::conditional_t, typename ComputeOp::Arguments, ArgumentsBase>; + //using Policy = std::conditional_t, typename ComputeOp::Policy, Policy_>; + +public: + + BlockElemwise(){} + + bool CanImplement(Arguments const& arg) { return 0; }; + int64_t GetWorkspaceSize(Arguments& arg) { return 0; }; + int64_t Run(Arguments const& arg, Policy* policy){ return 0; }; + +}; + +#include "block_elemwise_impl.h" + +} //ATVOS + +#endif //_BlockElemwise_H \ No newline at end of file diff --git a/best_practices/atvos/block/block_elemwise_impl.h b/best_practices/atvos/block/block_elemwise_impl.h new file mode 100644 index 000000000..6a3d03ff0 --- /dev/null +++ b/best_practices/atvos/block/block_elemwise_impl.h @@ -0,0 +1,22 @@ +#ifndef _ATVOS_BLOCK_ELEMWISE_H_ +#define _ATVOS_BLOCK_ELEMWISE_H_ + +/* +template +bool BlockElemwise::CanImplement(Arguments const& arg){ + return 0; +} +template +int64_t BlockElemwise::GetWorkspaceSize(Arguments& arg) +{ + return 0; +} +template +int64_t BlockElemwise:: Run(Arguments const& arg, Policy* policy) +{ + std::cout << "cccc" << std::endl; + return 0; +} + */ + +#endif \ No newline at end of file diff --git a/best_practices/atvos/common/dagsch.h b/best_practices/atvos/common/dagsch.h new file mode 100644 index 000000000..a1459eca8 --- /dev/null +++ b/best_practices/atvos/common/dagsch.h @@ -0,0 +1,28 @@ +#ifndef _ATVOS_DAG_SCHEDUCLE_H_ +#define _ATVOS_DAG_SCHEDUCLE_H_ + + +namespace ATVOS { + +template +class DagSchedule { + using OpTraits = int; + using Arguments = ArgumentsBase; + using Policy = int; + +public: + //1, 从outlist 中获取输入,输出, 以及Scalar 类型 + //2, 检查是否存在Copy 函数,决策是否生成Copy + //3, 计算Dag 存活节点 + //4, 基于计算指令,分配地址; + //5, 分析bank 冲突,重新计算地址 + //6, 切分多核, 生成Tiling 的结构 + +}; + +} // ATVOS + + + + +#endif //_ATVOS_DAG_SCHEDUCLE_H_ \ No newline at end of file diff --git a/best_practices/atvos/common/expression.h b/best_practices/atvos/common/expression.h new file mode 100644 index 000000000..fdc3805c8 --- /dev/null +++ b/best_practices/atvos/common/expression.h @@ -0,0 +1,121 @@ +#ifndef _ATVOS_EXPRESSION_H_ +#define _ATVOS_EXPRESSION_H_ + +namespace ATVOS { + +#if 0 +/** + * @brief condition + * + */ +template +struct TupleFilter {}; + +template +struct TupleFilter >{ + using Type = decltype(std::tuple_cat( + std::conditional_t< C(), std::tuple, std::tuple<>>{}... + )); +}; + +template +constexpr bool GetRealArgs() { return true; } + +#endif + +/** + * @tparam Func 需要执行的向量操作函数 + * @tparam Ts Func的入参 + */ +template +struct Bind { +public: + using Fun = Func; + using Args = std::tuple; + + constexpr static bool IsBindFun = true; + + template + constexpr Bind(Args... args){}; +/* + using RealArgs = typename TupleFilter::Type; + +private: + template + struct CreateRealBind; + + template + struct CreateRealBind> { + using Type = __aux::Condition<(Vec::IsCastNoOp::Value), + typename RealArgs::template At<0>, + Bind>; + }; + +public: + // 消除入参中CastNoOp之后的BindType + using RealBindType = typename CreateRealBind::Type; + + // 第 @offset 个出参Dtype + template + using FunRetArgType = typename Fun::template FunRetArgType; + // 第 @offset 个入参Dtype + template + using FunInArgType = typename Fun::template FunInArgType; + +public: + // 当前Bind的输出Dtype + using OutDataType = FunRetArgType<0>; + using DType = OutDataType; + + // 入参中输出Placeholder的列表(输出GM) + using OutHolders = typename Args::template Filter<__aux::TypeIsOutHolder>; + // 入参列表(Args中过滤掉Placeholder::Out) + using InArgs = typename Args::template Filter<__aux::TypeIsInput>; + // 入参中输入Placeholder的列表(输入GM) + using InHolders = typename Args::template Filter<__aux::TypeIsInHolder>; + // 入参中函数列表(其他Bind类型) + using InFuns = typename Args::template Filter<__aux::TypeIsInFun>; + // 入参中Scalar变量列表 + using Vars = typename Args::template Filter<__aux::TypeIsVar>; + // 入参中Const输入列表 + using ConstValues = typename Args::template Filter<__aux::TypeIsConst>; + // 入参中输入是TensorScalar的列表 + using InScalarHolders = typename InHolders::template Filter<__aux::TypeIsInScalarHolder>; + // 入参中函数是Scalar操作的列表 + using InScalarFuns = typename InFuns::template Filter<__aux::TypeIsScalarBind>; + // 入参中ReduceOp操作列表 + using InReduceOpFuns = typename InFuns::template Filter<__aux::TypeIsReduceOpBind>; + // 入参中非ScalarOp函数列表 + using InNonScalarFuns = typename InFuns::template Remove::template Remove; + + static_assert(InArgs::Size == InHolders::Size + InFuns::Size + Vars::Size + ConstValues::Size, "why?"); + + // 标识当前Bind是否是Scalar操作(不需要使用任何UB空间) + constexpr static bool IsScalarOp = InScalarHolders::Size == InHolders::Size && \ + InScalarFuns::Size == InFuns::Size && \ + !Vec::IsDuplicateOp::Value; + + // 入参个数 + constexpr static uint32_t InputSize = InArgs::Size; + // 当前Bind的依赖列表:输入依赖 + 自身 + using DependFuns = typename __aux::GetDependFunsAux::Type::template Append; + // 溯源当前Bind的输入PlaceHolder + using SourceInHolders = typename __aux::GetSourceInHoldersAux::Type::template Union; + +*/ +}; + + + + +template +constexpr inline auto Expr(Args... agrs) { + return Bind(agrs...); +} + + +} // namespace ATVOS + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/common/impl/atvos_impl.h b/best_practices/atvos/common/impl/atvos_impl.h new file mode 100644 index 000000000..9584a636c --- /dev/null +++ b/best_practices/atvos/common/impl/atvos_impl.h @@ -0,0 +1,152 @@ +#ifndef _ATVC_IMPL_H_IMPL_H_ +#define _ATVC_IMPL_H_IMPL_H_ + +#include + +namespace ATVOS { + +struct EmptyStruct {}; + +template +struct Tensor { + using Type = DataType; + using Layout = std::conditional_t< std::is_same_v, EmptyStruct, Layout_> ; + + void* gmAddr; + Layout layout; +}; + +namespace __auxc { + +#pragma region "算子原型的类型定义" + +enum class ParamType { + Normal, // 原生类型,不做处理 + INPUT, // GM + OUTPUT, // GM + TEMP, // UB + PRE_TEMP_OUTPUT, // UB + POST_TEMP_INPUT, // UB +}; +template +struct Param { + constexpr static ParamType paramType = paramType_; + using DataType = DataType_; + using Layout = Layout_; + using ExtList = std::tuple; +}; +#pragma endregion +} //namespace __auxc + +namespace __auxc { + +#pragma region "OpTraits To Tensor" +template +struct ArgsToTensorType { + using Type = T; +}; +template +struct ArgsToTensorType< Param > { + using Type = DataType_; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; + + +template +using ArgsToTensorType_t = typename ArgsToTensorType::Type; + +template +struct TraitsToTensor { using Type = T; }; + +template +struct TraitsToTensor > { + using Type = std::tuple< ArgsToTensorType_t... >; +}; +#pragma endregion +} //namespace __auxc + + + +namespace __auxc { + +template +struct GetParamLayout { + using Type = EmptyStruct; +}; + +template +struct GetParamLayout +{ + using Type = typename T::Layout; +}; + +template +using GetParamLayoutT = typename GetParamLayout::Type; + + + +//////////////////////// //////////// //////////// //////////// +template +struct _TemplateArgs {}; + +template<> +struct _TemplateArgs<> {}; + +template +struct _TemplateArgs : public _TemplateArgs { + using Type = GetParamLayoutT; +}; + + +///////////////////////////////////////////////////////////// +template +struct OpTraitsLayout{ + using Type = T; +}; + + +// Ts 为: Input +struct OpTraitsLayout< std::tuple > { + using Type = _TemplateArgs; +}; + +///////////////////////////////////////////////////////////// + +template +void LayoutSet(R& layout, T&& src, Shapes&&... ext){ + layout.template SetLayout(src); + if constexpr( sizeof...(ext) > 0){ + LayoutSet(layout, std::forward(ext)...); + } +} + + +} //__auxc + + + + +} //ATVOS + + +#endif //_ATVC_IMPL_H_IMPL_H_ \ No newline at end of file diff --git a/best_practices/atvos/common/layout.h b/best_practices/atvos/common/layout.h new file mode 100644 index 000000000..73a5c1cc8 --- /dev/null +++ b/best_practices/atvos/common/layout.h @@ -0,0 +1,116 @@ +#ifndef _ATVOSS_LAYOUT_H_ +#define _ATVOSS_LAYOUT_H_ + +#pragma once + +namespace ATVOS { + +template +using Shape = std::tuple; + +template +using Stride = std::tuple; + +template +using Coord = std::tuple; + +template +constexpr Shape make_shape(Ts const&... t) { + return {t...}; +} +template +constexpr Stride make_stride(Ts const&... t) { + return {t...}; +} + +template +constexpr Coord make_coord(Ts const&... t) { + return {t...}; +} + +template > +struct Layout : private std::tuple { + constexpr Layout(Shape const& shape = {}, Stride const& stride = {}) : std::tuple(shape, stride) + {} + constexpr decltype(auto) layout() { return *this; } + + constexpr decltype(auto) layout() const { return *this; } + + template + constexpr decltype(auto) shape() { + return get<0,I...>(static_cast&>(*this)); + } + + template + constexpr decltype(auto) shape() const { + return get<0,I...>(static_cast const&>(*this)); + } + + template + constexpr decltype(auto) stride() { + return get<1,I...>(static_cast&>(*this)); + } + + template + constexpr decltype(auto) stride() const { + return get<1,I...>(static_cast const&>(*this)); + } + template constexpr auto operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return crd2idx(coord, shape(), stride()); + } + } + + // Convenience function for multi-dimensional coordinates + template + constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + +}; + + + +template +constexpr decltype(auto) layout(Layout const& layout) +{ + if constexpr (sizeof...(Is) == 0) { + return layout; + } else { + return get(layout); + } +} + +template +constexpr decltype(auto) shape(Layout& layout) +{ + return layout.template shape(); +} + +template +constexpr decltype(auto) shape(Layout const& layout) +{ + return layout.template shape(); +} + +template +constexpr auto size(Layout const& layout) +{ + return size(shape(layout)); +} + +template +constexpr auto rank(Layout const& layout) +{ + return rank(shape(layout)); +} + +} //ATVOS + + + +#endif //_ATVOSS_LAYOUT_H_ \ No newline at end of file diff --git a/best_practices/atvos/common/placeholder.h b/best_practices/atvos/common/placeholder.h new file mode 100644 index 000000000..bdca4cd9c --- /dev/null +++ b/best_practices/atvos/common/placeholder.h @@ -0,0 +1,323 @@ + +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file placeholder.h + * \brief + */ +#ifndef _ATVOS_PLACEHOLDER_H_ +#define _ATVOS_PLACEHOLDER_H_ + +#ifndef __CCE_AICORE__ +#define __aicore__ +#define __global__ +#define __gm__ +#endif + +namespace Placeholder { + +#define CONCATENATE_DETAIL(name, idx) name##idx +#define MAKE_UNIQ_NAME(name, idx) CONCATENATE_DETAIL(name, idx) +#define MAKE_CONST(t, v) \ + struct MAKE_UNIQ_NAME(ConcatValue_##t##_, __COUNTER__) { \ + constexpr static bool IsConstValue_ = true; \ + static constexpr t value = v; \ + } + +enum HolderScope { GM, VECTOR }; + +template +struct Holder { + constexpr static int Pos = cur; + constexpr static bool IsHolder = true; + using DType = T; + using Attr = Attr_; +}; + +template +struct InAttr { + constexpr static int IsScalar = isScalar; + constexpr static HolderScope Scope = scope; +}; + +template +struct ScalarAttr : public InAttr {}; + +template +struct ScopeAttr : public InAttr {}; + +/** GM输入, 存在Scalar输入 */ +template > +struct In0 : public Holder {}; +template > +struct In1 : public Holder {}; +template > +struct In2 : public Holder {}; +template > +struct In3 : public Holder {}; +template > +struct In4 : public Holder {}; +template > +struct In5 : public Holder {}; +template > +struct In6 : public Holder {}; +template > +struct In7 : public Holder {}; +template > +struct In8 : public Holder {}; +template > +struct In9 : public Holder {}; +template > +struct In10 : public Holder {}; +template > +struct In11 : public Holder {}; + +template +struct IsInHolder { + constexpr static bool Value = false; +}; +template class T> +struct IsInHolder> { + constexpr static bool Value = false; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; + +/** 判断输入是否是Scalar输入 */ +template +struct IsInScalar { + constexpr static bool Value = false; +}; +template class T> +struct IsInScalar> { + constexpr static bool Value = Attr::IsScalar == 1; +}; + +/** GM输出,不存在Scalar输出 */ +template +struct Out0 : public Holder { + using Parent = Parent_; +}; +template +struct Out1 : public Holder { + using Parent = Parent_; +}; +template +struct Out2 : public Holder { + using Parent = Parent_; +}; +template +struct Out3 : public Holder { + using Parent = Parent_; +}; +template +struct Out4 : public Holder { + using Parent = Parent_; +}; +template +struct Out5 : public Holder { + using Parent = Parent_; +}; +template +struct Out6 : public Holder { + using Parent = Parent_; +}; +template +struct Out7 : public Holder { + using Parent = Parent_; +}; +template +struct Out8 : public Holder { + using Parent = Parent_; +}; +template +struct Out9 : public Holder { + using Parent = Parent_; +}; + +template +struct IsOutHolder { + constexpr static bool Value = false; +}; +template class T> +struct IsOutHolder> { + constexpr static bool Value = false; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; + +template +struct Var : public Holder { + constexpr static bool IsVarValue = true; +}; + +#define CHECK_TYPE(NAME, V) \ + template \ + struct NAME { \ + template \ + static uint8_t has(...); \ + template \ + static uint32_t has(decltype(C::V)); \ + constexpr static bool Value = sizeof(has(NULL)) == sizeof(uint32_t); \ + } + +CHECK_TYPE(IsVar, IsVarValue); +CHECK_TYPE(IsConstValue, IsConstValue_); + +/** 值设置:定义值存储结构 */ +template +struct VarTypeStruct {}; +template <> +struct VarTypeStruct<> {}; +template +struct VarTypeStruct { + using DataType = typename T::DType; + DataType value; + template + __aicore__ inline constexpr DataType Get() { + return value; + } + template + __aicore__ inline constexpr void Set(DataType v) { + value = v; + } +}; + +template +struct VarTypeStruct : public VarTypeStruct { + using DataType = typename T::DType; + DataType value; + template + __aicore__ inline constexpr DataType Get() { + if constexpr (offset == 0) { + return value; + } else { + return VarTypeStruct::template Get(); + } + } + template + __aicore__ inline constexpr void Set(DataType v) { + if constexpr (offset == 0) { + value = v; + } else { + VarTypeStruct::template Set(v); + } + } +}; + +template +struct VarTypeAux { + using Type = VarTypeStruct; +}; + + +} // namespace Placeholder + +#pragma region +template +struct Int { + constexpr static int value = v; +}; +#pragma endregion + + + +#endif // _ATVOS_PLACEHOLDER_H_ diff --git a/best_practices/atvos/device/device_vector.h b/best_practices/atvos/device/device_vector.h new file mode 100644 index 000000000..e312fe073 --- /dev/null +++ b/best_practices/atvos/device/device_vector.h @@ -0,0 +1,58 @@ +#ifndef _DEVICE_VECTOR_H_ +#define _DEVICE_VECTOR_H_ + +#include "../dfx/check.h" + + +namespace ATVOS { + + +using Stream = int*; + +/** + * @brief + * + * @tparam OpTraits_ 需要的Layout 信息 + * @tparam KernelOp kernel 的入口 + * @tparam Policy 策略 + * @tparam Enable + */ +template +class DeviceAdapt { +public: + using OpTraits = std::conditional_t, typename KernelOp::OpTraits, OpTraits_>; + using Arguments = std::conditional_t, typename KernelOp::Arguments, ArgumentsBase>; + using Policy = std::conditional_t, typename KernelOp::Policy, Policy_>; +public: + static_assert( Dfx::IsTupleV, "OpTraits_ is std::tuple"); +public: + DeviceAdapt(){}; + + bool CanImplement(Arguments& arg); //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg); + int64_t Run(Arguments& args, Policy* policy=nullptr, Stream stream = nullptr); + +}; + +#include "device_vector_impl.h" + + + + +} // ATVOS + + + +#endif //_DEVICE_VECTOR_H_ + + + + + + + + + + + + \ No newline at end of file diff --git a/best_practices/atvos/device/device_vector_impl.h b/best_practices/atvos/device/device_vector_impl.h new file mode 100644 index 000000000..3b52be84e --- /dev/null +++ b/best_practices/atvos/device/device_vector_impl.h @@ -0,0 +1,31 @@ +#ifndef __ATVOS_DEVECOR_VECTOR_IMPPL_H_ +#define __ATVOS_DEVECOR_VECTOR_IMPPL_H_ + + + +template +bool DeviceAdapt::CanImplement(Arguments& arg){ + //检查KernelOp 的In,out 是否进行了 gm 地址的赋值 + return 0; +}; +template +int64_t DeviceAdapt< KernelOp, OpTraits_, Policy_, Schedule, Enable>::GetWorkspaceSize(Arguments& arg) +{ + return 0; +}; + +template +int64_t DeviceAdapt::Run(Arguments& arg, Policy* policy, Stream stream){ + //构建Kernel 对象,然后调用 + KernelOp op; + + op.Run(arg, nullptr); + return 0; +}; + + + + + +#endif //__ATVOS_DEVECOR_VECTOR_IMPPL_H_ + diff --git a/best_practices/atvos/dfx/check.h b/best_practices/atvos/dfx/check.h new file mode 100644 index 000000000..604ac7db4 --- /dev/null +++ b/best_practices/atvos/dfx/check.h @@ -0,0 +1,25 @@ +#ifndef _ATVOS_DFX_CHECK_H_ +#define _ATVOS_DFX_CHECK_H_ + +namespace ATVOS { +namespace Dfx { + +template +struct IsTuple { constexpr static bool Value = false; }; + +template +struct IsTuple< std::tuple > { constexpr static bool Value = true; }; + +template +constexpr bool IsTupleV = IsTuple::Value; + +template +constexpr bool IsVoid = std::is_same_v; + + +} //namespace Dfx +} //namespace ATVOS + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/dfx/kernel_mirror.h b/best_practices/atvos/dfx/kernel_mirror.h new file mode 100644 index 000000000..214dc0b42 --- /dev/null +++ b/best_practices/atvos/dfx/kernel_mirror.h @@ -0,0 +1,32 @@ +#ifndef _ATVOS_DFX_KERNEL_MIRROR_H_ +#define _ATVOS_DFX_KERNEL_MIRROR_H_ + + + +#ifdef __CCE_AICORE__ +#include "kernel_operator.h" +using namespace AscendC; +#else +// CPU编译过程中需要Mock掉Kernel侧的实现 +#define __aicore__ +#define __global__ +#define __gm__ +struct half { + uint16_t value; +}; +struct bfloat16_t { + uint16_t value; +}; +template +struct LocalTensor {}; + +template +struct GlobalTensor { + void SetGlobalBuffer(T* y) {} +}; +#endif + + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/kernel/kernel_elemwise.h b/best_practices/atvos/kernel/kernel_elemwise.h new file mode 100644 index 000000000..870b17241 --- /dev/null +++ b/best_practices/atvos/kernel/kernel_elemwise.h @@ -0,0 +1,35 @@ +#ifndef __KERNEL_ELEMWISE_H_ +#define __KERNEL_ELEMWISE_H_ + +namespace ATVOS { + +template +class KernelElemwise +{ +public: + static_assert( !(Dfx::IsVoid && Dfx::IsVoid), "OpTraits_ is not void"); + +public: + using OpTraits = std::conditional_t, typename BlockOp::OpTraits, OpTraits_>; + using Arguments= std::conditional_t, typename BlockOp::Arguments, ArgumentsBase>; + using Policy = std::conditional_t, typename BlockOp::Policy, Policy_>; +public: + static_assert( Dfx::IsTupleV, "OpTraits_ is std::tuple"); +public: + KernelElemwise(){}; + + bool CanImplement(Arguments& arg); //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg); + int64_t Run(Arguments& args, Policy* policy); + +protected: + Policy policy; +}; + + +#include "kernel_elemwise_impl.h" + +} // ATVOS + + +#endif //__KERNEL_ELEMWISE_H_ \ No newline at end of file diff --git a/best_practices/atvos/kernel/kernel_elemwise_impl.h b/best_practices/atvos/kernel/kernel_elemwise_impl.h new file mode 100644 index 000000000..7bf3e0ca5 --- /dev/null +++ b/best_practices/atvos/kernel/kernel_elemwise_impl.h @@ -0,0 +1,24 @@ +#ifndef _ATVOS_KERNLE_ELELMWISE_H_ +#define _ATVOS_KERNLE_ELELMWISE_H_ + +template +bool KernelElemwise::CanImplement(Arguments& arg){ + //检查KernelOp 的In,out 是否进行了 gm 地址的赋值 + return true; +}; +template +int64_t KernelElemwise::GetWorkspaceSize(Arguments& arg) +{ + return 0; +}; + +template +int64_t KernelElemwise::Run(Arguments& args, Policy* policy){ + + BlockOp op; + op.Run(args, policy); + return 0; +}; + + +#endif diff --git a/best_practices/atvos/tile/tile_alu.h b/best_practices/atvos/tile/tile_alu.h new file mode 100644 index 000000000..9fae50a6c --- /dev/null +++ b/best_practices/atvos/tile/tile_alu.h @@ -0,0 +1,730 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file vec.h + * \brief + */ + +#ifndef _ATVOS_VECTOR_ALU_H_ +#define _ATVOS_VECTOR_ALU_H_ + +namespace ATVOS { +namespace Tile { + +#pragma region "TileOpBase" + +template +struct FunBase { + constexpr static int OutputSize = outSize; + constexpr static int InputSize = inSize; + constexpr static int TempSize = tempSize; + constexpr static int FixedSize = fixedBuf; + constexpr static bool IsAdvanced = isAdvanced; + + // 数据类型列表 + using DataTypes = typename std::tuple; + + // 输入的数据类型列表 + template + using FunRetArgType = typename std::tuple_element::type; + + // 输出的数据类型列表 + template + using FunInArgType = typename std::tuple_element::type; +}; + +// 单输入 +template +struct ElemwiseUnaryOP : public FunBase<1, 1, tempBufSize, fixedBuf, isAdvanced, R, In1> {}; + +// 双输入 +template +struct ElemwiseBinaryOP : public FunBase<1, 2, tempBufSize, fixedBuf, isAdvanced, R, In1, In2> {}; + +// 三输入 +template +struct ElemwiseTernaryOP : public FunBase<1, 3, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3> {}; + +// 四输入 +template +struct ElemwiseQuaternaryOP : public FunBase<1, 4, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4> {}; + +// 五输入 +template +struct ElemwiseQuinaryOP : public FunBase<1, 5, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5> {}; + +// 六输入 +template +struct Elemwise6OP : public FunBase<1, 6, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6> {}; + +// 7 输入 +template +struct Elemwise7OP : public FunBase<1, 7, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7> {}; + +// 8 输入 +template +struct Elemwise8OP : public FunBase<1, 8, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8> {}; + +// 9 输入 +template +struct Elemwise9OP : public FunBase<1, 9, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9> {}; + +// 10 +template +struct Elemwise10OP : public FunBase<1, 10, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9, In10> {}; + +// 11 +template +struct Elemwise11OP : public FunBase<1, 11, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9, In10, In11> {}; + + +#pragma region "单输入" + +template +struct Duplicate : public ElemwiseUnaryOP { +__aicore__ inline Duplicate(const LocalTensor& dstLocal, const T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dstLocal, scalar, count); +#endif + } +}; + +template +struct Reciprocal : public ElemwiseUnaryOP { + __aicore__ inline Reciprocal(const LocalTensor& dstLocal, const LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Reciprocal(dstLocal, src, count); +#endif + } +}; + +template +struct Log : public ElemwiseUnaryOP { + __aicore__ inline Log(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Log(dst, src, count); +#endif + } +}; +template +struct Copy : public ElemwiseUnaryOP { + __aicore__ inline Copy(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Copy(dst, src, count); +#endif + } +}; +template +struct Abs : public ElemwiseUnaryOP { + __aicore__ inline Abs(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Abs(dst, src, count); +#endif + } +}; +template +struct Cast : public ElemwiseUnaryOP { + __aicore__ inline Cast(LocalTensor& dst, LocalTensor& src, const uint32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Cast(dst, src, static_cast(roundMode), count); +#endif + } +}; + + +template +struct Exp : public ElemwiseUnaryOP { + __aicore__ inline Exp(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Exp(dst, src, count); +#endif + } +}; + +template +struct Sin : public ElemwiseUnaryOP { + __aicore__ inline Sin(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + static constexpr SinConfig config = { SinAlgo::RADIAN_REDUCTION }; + AscendC::Sin(dst, src, count); +#endif + } +}; + +template +struct Cos : public ElemwiseUnaryOP { + __aicore__ inline Cos(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + static constexpr CosConfig config = { CosAlgo::RADIAN_REDUCTION }; + AscendC::Cos(dst, src, count); +#endif + } +}; + +template +struct Sqrt : public ElemwiseUnaryOP { + __aicore__ inline Sqrt(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Sqrt(dst, src, count); +#endif + } +}; + +template +struct ReduceOp : public ElemwiseUnaryOP { + __aicore__ inline ReduceOp(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + (void)dst; + (void)src; + (void)count; +#endif + } +}; + +template +struct Truncate : public ElemwiseUnaryOP { + __aicore__ inline Truncate(LocalTensor& dst, LocalTensor& src, const uint32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Truncate(roundMode)>(dst, src, count); +#endif + } +}; + +template +struct Brc : public ElemwiseUnaryOP{ + __aicore__ inline Brc(LocalTensor& dst, LocalTensor& src, int count) + // __aicore__ inline Brc(LocalTensor& dst, LocalTensor& src, const uint32_t* dstShape, const uint32_t* srcShape, BroadcastTiling* tiling) + { +#ifdef __CCE_AICORE__ + // AscendC::Broadcast(dst, src, dstShape, srcShape, tiling); +#endif + } +}; + +template +struct Not : public ElemwiseUnaryOP { + __aicore__ inline Not(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Not(dst, src, count); +#endif + } +}; +#pragma endregion "单输入" + +#pragma region "Copy" + +template +struct CopyIn : public ElemwiseUnaryOP { + __aicore__ inline CopyIn(const LocalTensor& dst, const GlobalTensor& src, uint32_t count) { +#ifdef __CCE_AICORE__ + DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams{false, 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams, padParams); +#endif + } +}; + +static constexpr uint8_t NDDMA_DIM = 5; +template +struct CopyInBrc : public ElemwiseUnaryOP{ +#ifdef __DAV_C310__ + __aicore__ inline CopyInBrc(const LocalTensor& dst, const GlobalTensor& src, + const MultiCopyParams& params, + const MultiCopyConfig& config = kDefaultMultiCopyConfig) + { + AscendC::DataCopy(dst, src, params); + } +#endif + __aicore__ inline CopyInBrc(const LocalTensor& dst, const GlobalTensor& src, uint32_t count) + { +#ifdef __CCE_AICORE__ + AscendC::DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams{false, 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams, padParams); +#endif + } +}; +template +struct CopyOut : public ElemwiseUnaryOP { + __aicore__ inline CopyOut(GlobalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams); +#endif + } +}; +#pragma endreigon "Copy" + +#pragma region "双输入" + +template +struct Add : public ElemwiseBinaryOP { + __aicore__ inline Add(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Add(dst, src1, src2, count); +#endif + } +}; + +template +struct Fmod : public ElemwiseBinaryOP { + __aicore__ inline Fmod(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Fmod(dst, src1, src2, count); +#endif + } +}; + +template +struct Adds : public ElemwiseBinaryOP { + __aicore__ inline Adds(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Adds(dst, src, scalar, count); +#endif + } + __aicore__ inline Adds(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Adds(dst, src, scalar, count); +#endif + } + __aicore__ inline Adds(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 + scalar1;; + } +}; + +template +struct Relu : public ElemwiseBinaryOP { + __aicore__ inline Relu(const LocalTensor& dst, const LocalTensor& src, int32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Relu(dst, src, count); +#endif + } +}; + +template +struct Max : public ElemwiseBinaryOP { + __aicore__ inline Max(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Max(dst, src1, src2, count); +#endif + } +}; + +template +struct Maxs : public ElemwiseBinaryOP { + __aicore__ inline Maxs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Maxs(dst, src, scalar, count); +#endif + } + __aicore__ inline Maxs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Max(dst, dst, src, count); +#endif + } +}; + +template +struct Min : public ElemwiseBinaryOP { + __aicore__ inline Min(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Min(dst, src1, src2, count); +#endif + } +}; + +template +struct Mins : public ElemwiseBinaryOP { + __aicore__ inline Mins(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mins(dst, src, scalar, count); +#endif + } + __aicore__ inline Mins(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mins(dst, src, scalar, count); +#endif + } +}; + +template +struct Sub : public ElemwiseBinaryOP { + __aicore__ inline Sub(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Sub(dst, src1, src2, count); +#endif + } +}; + +template +struct Mul : public ElemwiseBinaryOP { + __aicore__ inline Mul(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mul(dst, src1, src2, count); +#endif + } +}; + +template +struct Muls : public ElemwiseBinaryOP { + __aicore__ inline Muls(const LocalTensor& dst, const T& scalar, const LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Muls(dst, src, scalar, count); +#endif + } + __aicore__ inline Muls(const LocalTensor& dst, const LocalTensor& src, const T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Muls(dst, src, scalar, count); +#endif + } + __aicore__ inline Muls(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 * scalar1;; + } +}; + +template +struct Div : public ElemwiseBinaryOP { + __aicore__ inline Div(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Div(dst, src1, src2, count); +#endif + } +}; + +template +struct DivHighPrecision : public ElemwiseBinaryOP { + __aicore__ inline DivHighPrecision(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + static constexpr DivConfig config = {DivAlgo::DIFF_COMPENSATION}; + AscendC::Div(dst, src1, src2, count); +#endif + } +}; + +template +struct Divs : public ElemwiseBinaryOP { + __aicore__ inline Divs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Div(dst, src, dst, count); +#endif + } + __aicore__ inline Divs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Div(dst, dst, src, count); +#endif + } + __aicore__ inline Divs(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 / scalar1;; + } +}; + +template +struct Subs : public ElemwiseBinaryOP { + __aicore__ inline Subs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Sub(dst, src, dst, count); +#endif + } + __aicore__ inline Subs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Sub(dst, dst, src, count); +#endif + } + __aicore__ inline Subs(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 - scalar1;; + } +}; + +template +struct LeakyRelu : public ElemwiseBinaryOP { + __aicore__ inline LeakyRelu(LocalTensor& dst, LocalTensor& src, T& scalarValue, int count) { +#ifdef __CCE_AICORE__ + AscendC::LeakyRelu(dst, src, scalarValue, count); +#endif + } +}; + +template +struct Compare : public ElemwiseBinaryOP { + __aicore__ inline Compare(LocalTensor& dstLocal, LocalTensor& src0Local, LocalTensor& src1Local, + uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Compare(dstLocal, src0Local, src1Local, static_cast(cmpMode), count); +#endif + } + + __aicore__ inline Compare(LocalTensor& dstLocal, LocalTensor& src0Local, T src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::CompareScalar(dstLocal, src0Local, src1Scalar, static_cast(cmpMode), count); +#endif + } +}; + +template +struct And : public ElemwiseBinaryOP { + __aicore__ inline And(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::And(dst, src1, src2, count); +#endif + } +}; + +template +struct Ands : public ElemwiseBinaryOP { + __aicore__ inline Ands(LocalTensor& dst, LocalTensor& src, T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ands(dst, src, scalar, count); +#endif + } + __aicore__ inline Ands(LocalTensor& dst, T& scalar, LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ands(dst, scalar, src, count); +#endif + } +}; + +template +struct Or : public ElemwiseBinaryOP { + __aicore__ inline Or(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Or(dst, src1, src2, count); +#endif + } +}; + +template +struct Ors : public ElemwiseBinaryOP { + __aicore__ inline Ors(LocalTensor& dst, LocalTensor& src, T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ors(dst, src, scalar, count); +#endif + } + __aicore__ inline Ors(LocalTensor& dst, T& scalar, LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ors(dst, scalar, src, count); +#endif + } +}; + +template +struct Power : public ElemwiseBinaryOP { + __aicore__ inline Power(LocalTensor& dstLocal, LocalTensor& src0Local, LocalTensor& src1Local, + uint32_t count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Local, src1Local, count); +#endif + } + + __aicore__ inline Power(LocalTensor& dstLocal, LocalTensor& src0Local, T& src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Local, src1Scalar, count); +#endif + } + + __aicore__ inline Power(LocalTensor& dstLocal, T& src0Scalar, LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Scalar, src1Local, count); +#endif + } +}; +#pragma endregion "双输入" + +#pragma region "三输入" + +template +struct Select : public ElemwiseTernaryOP { + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + const LocalTensor& src0Local, const LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Local, src1Local, static_cast(selMode), count); +#endif + } + + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + const LocalTensor& src0Local, T src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Local, src1Scalar, + AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE, count); +#endif + } + + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + T src0Scalar, const LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Scalar, src1Local, + AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE, count); +#endif + } +}; + +/** + * dst = src1 + src2 * alpha + */ +template +struct FusedMulAdd : public ElemwiseTernaryOP { + __aicore__ inline FusedMulAdd(const LocalTensor& dst, const LocalTensor& src1, const LocalTensor& src2, + const LocalTensor& alpha, int count) { +#ifdef __CCE_AICORE__ + AscendC::FusedMulAdd(src2, alpha, src1, count); + AscendC::Copy(dst, src2, count); +#endif + } +}; + +/** + * dst = src1 + src2 * scalar + */ +template +struct Axpy : public ElemwiseTernaryOP { + __aicore__ inline Axpy(const LocalTensor& dst, const T& scalar, const LocalTensor& src1, + const LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Axpy(src1, src2, scalar, count); + AscendC::Copy(dst, src1, count); +#endif + } + __aicore__ inline Axpy(const LocalTensor& dst, const LocalTensor& src1, const LocalTensor& src2, + const T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Axpy(src1, src2, scalar, count); + AscendC::Copy(dst, src1, count); +#endif + } +}; +#pragma endregion "三输入" + + +} //namespace Tile +} //namespace ATVOS + + + +#if 0 + +template +struct IsCastOp { + constexpr static bool Value = false; +}; + +template +struct IsCastOp> { + constexpr static bool Value = true; +}; + +template +struct IsCastNoOp { + constexpr static bool Value = false; +}; + +template +struct IsCastNoOp> { + constexpr static bool Value = std::is_same::value; +}; + +template +struct IsCopyInBrcOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyInBrcOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyInOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyInOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyInOp> { + constexpr static bool Value = true; +}; + +template +struct IsVecBrcOp { + constexpr static bool Value = false; +}; + +template +struct IsVecBrcOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyOutOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyOutOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyOutOp> { + constexpr static bool Value = true; +}; + +template +struct IsReduceOp { + constexpr static bool Value = false; +}; + +template +struct IsReduceOp> { + constexpr static bool Value = true; +}; + +template +struct IsDuplicateOp { + constexpr static bool Value = false; +}; + +template +struct IsDuplicateOp> { + constexpr static bool Value = true; +}; + +#endif + + +#endif //_ATVOS_VECTOR_ALU_H_ \ No newline at end of file -- Gitee From ff608d8af484fc3ec044ab3dffb4524cadefeb48 Mon Sep 17 00:00:00 2001 From: wly451717 Date: Tue, 9 Sep 2025 16:47:42 +0800 Subject: [PATCH 002/116] fix compile err --- best_practices/atvos/{ => include}/atvos.h | 0 .../{ => include}/block/block_elemwise.h | 0 .../{ => include}/block/block_elemwise_impl.h | 0 .../atvos/{ => include}/common/dagsch.h | 0 .../atvos/{ => include}/common/expression.h | 0 .../{ => include}/common/impl/atvos_impl.h | 0 .../atvos/{ => include}/common/layout.h | 0 .../atvos/{ => include}/common/placeholder.h | 0 .../{ => include}/device/device_vector.h | 0 .../{ => include}/device/device_vector_impl.h | 0 .../atvos/{ => include}/dfx/check.h | 0 .../atvos/{ => include}/dfx/kernel_mirror.h | 0 .../{ => include}/kernel/kernel_elemwise.h | 0 .../kernel/kernel_elemwise_impl.h | 0 .../atvos/{ => include}/tile/tile_alu.h | 0 best_practices/atvos/main/main.cpp | 112 ++++++++++++++++++ 16 files changed, 112 insertions(+) rename best_practices/atvos/{ => include}/atvos.h (100%) rename best_practices/atvos/{ => include}/block/block_elemwise.h (100%) rename best_practices/atvos/{ => include}/block/block_elemwise_impl.h (100%) rename best_practices/atvos/{ => include}/common/dagsch.h (100%) rename best_practices/atvos/{ => include}/common/expression.h (100%) rename best_practices/atvos/{ => include}/common/impl/atvos_impl.h (100%) rename best_practices/atvos/{ => include}/common/layout.h (100%) rename best_practices/atvos/{ => include}/common/placeholder.h (100%) rename best_practices/atvos/{ => include}/device/device_vector.h (100%) rename best_practices/atvos/{ => include}/device/device_vector_impl.h (100%) rename best_practices/atvos/{ => include}/dfx/check.h (100%) rename best_practices/atvos/{ => include}/dfx/kernel_mirror.h (100%) rename best_practices/atvos/{ => include}/kernel/kernel_elemwise.h (100%) rename best_practices/atvos/{ => include}/kernel/kernel_elemwise_impl.h (100%) rename best_practices/atvos/{ => include}/tile/tile_alu.h (100%) create mode 100644 best_practices/atvos/main/main.cpp diff --git a/best_practices/atvos/atvos.h b/best_practices/atvos/include/atvos.h similarity index 100% rename from best_practices/atvos/atvos.h rename to best_practices/atvos/include/atvos.h diff --git a/best_practices/atvos/block/block_elemwise.h b/best_practices/atvos/include/block/block_elemwise.h similarity index 100% rename from best_practices/atvos/block/block_elemwise.h rename to best_practices/atvos/include/block/block_elemwise.h diff --git a/best_practices/atvos/block/block_elemwise_impl.h b/best_practices/atvos/include/block/block_elemwise_impl.h similarity index 100% rename from best_practices/atvos/block/block_elemwise_impl.h rename to best_practices/atvos/include/block/block_elemwise_impl.h diff --git a/best_practices/atvos/common/dagsch.h b/best_practices/atvos/include/common/dagsch.h similarity index 100% rename from best_practices/atvos/common/dagsch.h rename to best_practices/atvos/include/common/dagsch.h diff --git a/best_practices/atvos/common/expression.h b/best_practices/atvos/include/common/expression.h similarity index 100% rename from best_practices/atvos/common/expression.h rename to best_practices/atvos/include/common/expression.h diff --git a/best_practices/atvos/common/impl/atvos_impl.h b/best_practices/atvos/include/common/impl/atvos_impl.h similarity index 100% rename from best_practices/atvos/common/impl/atvos_impl.h rename to best_practices/atvos/include/common/impl/atvos_impl.h diff --git a/best_practices/atvos/common/layout.h b/best_practices/atvos/include/common/layout.h similarity index 100% rename from best_practices/atvos/common/layout.h rename to best_practices/atvos/include/common/layout.h diff --git a/best_practices/atvos/common/placeholder.h b/best_practices/atvos/include/common/placeholder.h similarity index 100% rename from best_practices/atvos/common/placeholder.h rename to best_practices/atvos/include/common/placeholder.h diff --git a/best_practices/atvos/device/device_vector.h b/best_practices/atvos/include/device/device_vector.h similarity index 100% rename from best_practices/atvos/device/device_vector.h rename to best_practices/atvos/include/device/device_vector.h diff --git a/best_practices/atvos/device/device_vector_impl.h b/best_practices/atvos/include/device/device_vector_impl.h similarity index 100% rename from best_practices/atvos/device/device_vector_impl.h rename to best_practices/atvos/include/device/device_vector_impl.h diff --git a/best_practices/atvos/dfx/check.h b/best_practices/atvos/include/dfx/check.h similarity index 100% rename from best_practices/atvos/dfx/check.h rename to best_practices/atvos/include/dfx/check.h diff --git a/best_practices/atvos/dfx/kernel_mirror.h b/best_practices/atvos/include/dfx/kernel_mirror.h similarity index 100% rename from best_practices/atvos/dfx/kernel_mirror.h rename to best_practices/atvos/include/dfx/kernel_mirror.h diff --git a/best_practices/atvos/kernel/kernel_elemwise.h b/best_practices/atvos/include/kernel/kernel_elemwise.h similarity index 100% rename from best_practices/atvos/kernel/kernel_elemwise.h rename to best_practices/atvos/include/kernel/kernel_elemwise.h diff --git a/best_practices/atvos/kernel/kernel_elemwise_impl.h b/best_practices/atvos/include/kernel/kernel_elemwise_impl.h similarity index 100% rename from best_practices/atvos/kernel/kernel_elemwise_impl.h rename to best_practices/atvos/include/kernel/kernel_elemwise_impl.h diff --git a/best_practices/atvos/tile/tile_alu.h b/best_practices/atvos/include/tile/tile_alu.h similarity index 100% rename from best_practices/atvos/tile/tile_alu.h rename to best_practices/atvos/include/tile/tile_alu.h diff --git a/best_practices/atvos/main/main.cpp b/best_practices/atvos/main/main.cpp new file mode 100644 index 000000000..82b565b12 --- /dev/null +++ b/best_practices/atvos/main/main.cpp @@ -0,0 +1,112 @@ +#include + +#include "../include/atvos.h" +#include "../include/tile/tile_alu.h" +#include "../include/block/block_elemwise.h" +#include "../include/kernel/kernel_elemwise.h" +#include "../include/device/device_vector.h" + + + +using namespace ATVOS; + +// 1, 描述算子原型 +using Input0 = Input> >; //描述每个Block的输入,以及处理的Shape 大小 +using Input1 = Input> >; //描述每个Block的输入, 自动推导 +using out = Output> >; //描述每个Block的输入,自动推导 + +using OpTraits = std::tuple; + +// 2, 描述Compute 过程 +template +struct MyComputer { + using T = Traits_; + constexpr static auto myAdd = Expr>( Placeholder::In1{}, Placeholder::In2{}); + constexpr static auto subs = Expr>(myAdd, 0.5 ); + constexpr static auto out = std::tuple{subs}; +}; + +#if 0 +template +struct MyComputer { + using T = Traits_; + template + void Run(LocalTensor out, LocalTensor& in1, LocalTensor& in2, int a ){ + auto myAdd = Expr>( in1, in2); + auto subs = Expr>(myAdd, 0.5 ); + out = Evaluate(subs); + } + + template + auto Run(){ + auto myAdd = Expr>( in1, in2); + auto subs = Expr>(myAdd, 0.5 ); + return subs; + } +}; +#endif + +// 3, 选择 Block 模版(进行调度) +template +using MyBlock = BlockElemwise::out), Traits_>; + +// 4, 选择 kerne 类型( 处理整尾核) +template +using MyKernel = KernelElemwise< MyBlock>; + +// 5, Device 侧 host+device 混合编程 +using device = DeviceAdapt< MyKernel >; + +int main(int argc, const char* argv[]) { + + + device op; + + void* gmAddr = nullptr; + auto tensor = MakeTensor(gmAddr, MakeShape(1024,1024)); //硬件可能的Shape + + device::Arguments arguments( tensor, tensor, MakeTensor(gmAddr, MakeShape(1024,1024)), 20) ; + + // 校验参数是否合理(当前device能否支持arguments) + auto status = op.CanImplement(arguments); + size_t sizeWorkspace = op.GetWorkspaceSize(arguments); + + op.Run(arguments); + + std::cout << "run" << std::endl; + + return 0; + +} + + +#if 0 + +void kernel() { + TPipe pipe; + constexpr TQue queIn; //编译时分配资源 + constexpr TQue queOut; //编译时分配资源 + constexpr TQue queCalc; //编译时分配资源 + + //vector 操作 + LocalTensor tensor; + LocalTensor tensorOut, tensorOut2; + DataCopy(tensor, gmTesnor, 1024); + queIn.Enque(tensor); + queIn.Deque(tensor); + + Add(tensorOut, tensor, tensor, 1024) + + //Cube 操作 + TQue::EnQue(tensorOut); + TQue::DeQue(tensorOut); + Conv(tensorOut2, tensorOut, ...); + TQue::EnQue(tensorOut2); + TQue::DeQue(tensorOut2); + + DataCopy(gm2, tensorOut2, 1024) + + pipe.staticInitBuffer(queIn); + pipe.staticInitBuffer(queOut); +} +#endif -- Gitee From 857ca9f2e2129246a987e9bc1e1fed135a535ee6 Mon Sep 17 00:00:00 2001 From: wwx921016 Date: Wed, 10 Sep 2025 09:23:42 +0800 Subject: [PATCH 003/116] Add expression template demo --- .../expr_template_demo.cpp | 801 ++++++++++++++++++ 1 file changed, 801 insertions(+) create mode 100644 best_practices/expression_template/expr_template_demo.cpp diff --git a/best_practices/expression_template/expr_template_demo.cpp b/best_practices/expression_template/expr_template_demo.cpp new file mode 100644 index 000000000..3d6fec071 --- /dev/null +++ b/best_practices/expression_template/expr_template_demo.cpp @@ -0,0 +1,801 @@ +#include +#include +#include +#include +#include +#include +#include + + +/* Utilities */ + +namespace Util { + +template +struct AlwaysFalse : std::false_type {}; + +template +inline constexpr bool AlwaysFalse_v = AlwaysFalse::value; + +template