diff --git a/best_practices/atvos/include/atvos.h b/best_practices/atvos/include/atvos.h new file mode 100644 index 0000000000000000000000000000000000000000..f3912288633093810de5117f8bd34ff9865f8561 --- /dev/null +++ b/best_practices/atvos/include/atvos.h @@ -0,0 +1,87 @@ +#ifndef _ATVC_H_TEMP_H_ +#define _ATVC_H_TEMP_H_ + +#include "common/impl/atvos_impl.h" + +namespace ATVOS { + +#pragma region "参数类型" +template +using Input = __auxc::Param<__auxc::ParamType::INPUT, DataType, Layout, Ext...>; + +template +using Output = __auxc::Param<__auxc::ParamType::OUTPUT, DataType, Layout, Ext...>; + +#pragma endregion + +#pragma region "layout" +template +struct Layout { + Shape shape; +}; + +template +using Shape = std::tuple; + +template +Shape MakeShape(Args... ts) { + return {ts...}; +} + +template +auto MakeLayout(Shape&& shape, Stride&& stride) { + return Layout{ shape}; +} +template +auto MakeLayout(Shape&& shape) { + return Layout{ shape}; +} + +template +auto MakeTensor(void* gmAddr, Shape&& layout) { + return Tensor>{gmAddr, layout}; +} + +#pragma endregion "layout" + + +template +struct ArgumentsBase { + using LayoutArgs = typename __auxc::TraitsToTensor::Type; +private: + template + static void SetArgs(LayoutArgs& obj, Argc const & argc, Args const& ... args) + { + constexpr static auto size = std::tuple_size_v< std::remove_reference_t< LayoutArgs > >; + + static_assert(offset= sizeof(LayoutArgs)"); + + //warning: 这个代码要修改,因为常量时 layout 与 传递的layout 的结构是不一样的 + std::get(obj) = argc; + + if constexpr ( offset < size - 1 && sizeof...(args) > 0 ) { + SetArgs(obj, args...); + } + } +public: + template + ArgumentsBase(Args const &... args) { + SetArgs<0>(this->param, args...); + } + ArgumentsBase(){}; +public: + LayoutArgs param; +}; + + +} //ATVOS + +#include "dfx/kernel_mirror.h" +#include "dfx/check.h" + +#include "common/placeholder.h" +#include "common/expression.h" +#include "common/dagsch.h" + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/include/block/block_elemwise.h b/best_practices/atvos/include/block/block_elemwise.h new file mode 100644 index 0000000000000000000000000000000000000000..ff45114fb65e2baaf9d340fc400004440b1de5d5 --- /dev/null +++ b/best_practices/atvos/include/block/block_elemwise.h @@ -0,0 +1,50 @@ +#ifndef _BlockElemwise_H +#define _BlockElemwise_H + +namespace ATVOS { + +template +class BlockElemwise { +public: + using OpTraits = Traits; + using Arguments= ArgumentsBase; + using Policy = Policy_; +public: + BlockElemwise(){} + + bool CanImplement(Arguments const& arg){ return true;}; //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg){ return 0; }; + int64_t Run(Arguments const& arg, Policy* policy){ return 0; }; +}; + +template +class BlockElemwise { +public: + using ComputeOp = DagOp; //DagSchedule; + using OpTraits = Traits; + using Arguments= ArgumentsBase; + using Policy = int; + +public: + static_assert( !(Dfx::IsVoid /* && Dfx::IsVoid */ ), "OpTraits_ is not void"); + +public: + //using OpTraits = std::conditional_t, typename ComputeOp::OpTraits, OpTraits_>; + //using Arguments= std::conditional_t, typename ComputeOp::Arguments, ArgumentsBase>; + //using Policy = std::conditional_t, typename ComputeOp::Policy, Policy_>; + +public: + + BlockElemwise(){} + + bool CanImplement(Arguments const& arg) { return 0; }; + int64_t GetWorkspaceSize(Arguments& arg) { return 0; }; + int64_t Run(Arguments const& arg, Policy* policy){ return 0; }; + +}; + +#include "block_elemwise_impl.h" + +} //ATVOS + +#endif //_BlockElemwise_H \ No newline at end 
of file diff --git a/best_practices/atvos/include/block/block_elemwise_impl.h b/best_practices/atvos/include/block/block_elemwise_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6a3d03ff06ccd80303a2e439e22c6ab33d91dfa1 --- /dev/null +++ b/best_practices/atvos/include/block/block_elemwise_impl.h @@ -0,0 +1,22 @@ +#ifndef _ATVOS_BLOCK_ELEMWISE_H_ +#define _ATVOS_BLOCK_ELEMWISE_H_ + +/* +template +bool BlockElemwise::CanImplement(Arguments const& arg){ + return 0; +} +template +int64_t BlockElemwise::GetWorkspaceSize(Arguments& arg) +{ + return 0; +} +template +int64_t BlockElemwise:: Run(Arguments const& arg, Policy* policy) +{ + std::cout << "cccc" << std::endl; + return 0; +} + */ + +#endif \ No newline at end of file diff --git a/best_practices/atvos/include/common/dagsch.h b/best_practices/atvos/include/common/dagsch.h new file mode 100644 index 0000000000000000000000000000000000000000..a1459eca8e4ae07db94ddde8a5cc81bcc8059a45 --- /dev/null +++ b/best_practices/atvos/include/common/dagsch.h @@ -0,0 +1,28 @@ +#ifndef _ATVOS_DAG_SCHEDUCLE_H_ +#define _ATVOS_DAG_SCHEDUCLE_H_ + + +namespace ATVOS { + +template +class DagSchedule { + using OpTraits = int; + using Arguments = ArgumentsBase; + using Policy = int; + +public: + //1, 从outlist 中获取输入,输出, 以及Scalar 类型 + //2, 检查是否存在Copy 函数,决策是否生成Copy + //3, 计算Dag 存活节点 + //4, 基于计算指令,分配地址; + //5, 分析bank 冲突,重新计算地址 + //6, 切分多核, 生成Tiling 的结构 + +}; + +} // ATVOS + + + + +#endif //_ATVOS_DAG_SCHEDUCLE_H_ \ No newline at end of file diff --git a/best_practices/atvos/include/common/expression.h b/best_practices/atvos/include/common/expression.h new file mode 100644 index 0000000000000000000000000000000000000000..fdc3805c885b468c9d5616eab48453443b9ba1e2 --- /dev/null +++ b/best_practices/atvos/include/common/expression.h @@ -0,0 +1,121 @@ +#ifndef _ATVOS_EXPRESSION_H_ +#define _ATVOS_EXPRESSION_H_ + +namespace ATVOS { + +#if 0 +/** + * @brief condition + * + */ +template +struct TupleFilter {}; + +template +struct TupleFilter >{ + using Type = decltype(std::tuple_cat( + std::conditional_t< C(), std::tuple, std::tuple<>>{}... + )); +}; + +template +constexpr bool GetRealArgs() { return true; } + +#endif + +/** + * @tparam Func 需要执行的向量操作函数 + * @tparam Ts Func的入参 + */ +template +struct Bind { +public: + using Fun = Func; + using Args = std::tuple; + + constexpr static bool IsBindFun = true; + + template + constexpr Bind(Args... 
args){}; +/* + using RealArgs = typename TupleFilter::Type; + +private: + template + struct CreateRealBind; + + template + struct CreateRealBind> { + using Type = __aux::Condition<(Vec::IsCastNoOp::Value), + typename RealArgs::template At<0>, + Bind>; + }; + +public: + // 消除入参中CastNoOp之后的BindType + using RealBindType = typename CreateRealBind::Type; + + // 第 @offset 个出参Dtype + template + using FunRetArgType = typename Fun::template FunRetArgType; + // 第 @offset 个入参Dtype + template + using FunInArgType = typename Fun::template FunInArgType; + +public: + // 当前Bind的输出Dtype + using OutDataType = FunRetArgType<0>; + using DType = OutDataType; + + // 入参中输出Placeholder的列表(输出GM) + using OutHolders = typename Args::template Filter<__aux::TypeIsOutHolder>; + // 入参列表(Args中过滤掉Placeholder::Out) + using InArgs = typename Args::template Filter<__aux::TypeIsInput>; + // 入参中输入Placeholder的列表(输入GM) + using InHolders = typename Args::template Filter<__aux::TypeIsInHolder>; + // 入参中函数列表(其他Bind类型) + using InFuns = typename Args::template Filter<__aux::TypeIsInFun>; + // 入参中Scalar变量列表 + using Vars = typename Args::template Filter<__aux::TypeIsVar>; + // 入参中Const输入列表 + using ConstValues = typename Args::template Filter<__aux::TypeIsConst>; + // 入参中输入是TensorScalar的列表 + using InScalarHolders = typename InHolders::template Filter<__aux::TypeIsInScalarHolder>; + // 入参中函数是Scalar操作的列表 + using InScalarFuns = typename InFuns::template Filter<__aux::TypeIsScalarBind>; + // 入参中ReduceOp操作列表 + using InReduceOpFuns = typename InFuns::template Filter<__aux::TypeIsReduceOpBind>; + // 入参中非ScalarOp函数列表 + using InNonScalarFuns = typename InFuns::template Remove::template Remove; + + static_assert(InArgs::Size == InHolders::Size + InFuns::Size + Vars::Size + ConstValues::Size, "why?"); + + // 标识当前Bind是否是Scalar操作(不需要使用任何UB空间) + constexpr static bool IsScalarOp = InScalarHolders::Size == InHolders::Size && \ + InScalarFuns::Size == InFuns::Size && \ + !Vec::IsDuplicateOp::Value; + + // 入参个数 + constexpr static uint32_t InputSize = InArgs::Size; + // 当前Bind的依赖列表:输入依赖 + 自身 + using DependFuns = typename __aux::GetDependFunsAux::Type::template Append; + // 溯源当前Bind的输入PlaceHolder + using SourceInHolders = typename __aux::GetSourceInHoldersAux::Type::template Union; + +*/ +}; + + + + +template +constexpr inline auto Expr(Args... 
agrs) { + return Bind(agrs...); +} + + +} // namespace ATVOS + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/include/common/impl/atvos_impl.h b/best_practices/atvos/include/common/impl/atvos_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..9584a636cf84621e97e6665ac6132a5d31a77407 --- /dev/null +++ b/best_practices/atvos/include/common/impl/atvos_impl.h @@ -0,0 +1,152 @@ +#ifndef _ATVC_IMPL_H_IMPL_H_ +#define _ATVC_IMPL_H_IMPL_H_ + +#include + +namespace ATVOS { + +struct EmptyStruct {}; + +template +struct Tensor { + using Type = DataType; + using Layout = std::conditional_t< std::is_same_v, EmptyStruct, Layout_> ; + + void* gmAddr; + Layout layout; +}; + +namespace __auxc { + +#pragma region "算子原型的类型定义" + +enum class ParamType { + Normal, // 原生类型,不做处理 + INPUT, // GM + OUTPUT, // GM + TEMP, // UB + PRE_TEMP_OUTPUT, // UB + POST_TEMP_INPUT, // UB +}; +template +struct Param { + constexpr static ParamType paramType = paramType_; + using DataType = DataType_; + using Layout = Layout_; + using ExtList = std::tuple; +}; +#pragma endregion +} //namespace __auxc + +namespace __auxc { + +#pragma region "OpTraits To Tensor" +template +struct ArgsToTensorType { + using Type = T; +}; +template +struct ArgsToTensorType< Param > { + using Type = DataType_; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; +template +struct ArgsToTensorType< Param > { + using Type = Tensor; +}; + + +template +using ArgsToTensorType_t = typename ArgsToTensorType::Type; + +template +struct TraitsToTensor { using Type = T; }; + +template +struct TraitsToTensor > { + using Type = std::tuple< ArgsToTensorType_t... >; +}; +#pragma endregion +} //namespace __auxc + + + +namespace __auxc { + +template +struct GetParamLayout { + using Type = EmptyStruct; +}; + +template +struct GetParamLayout +{ + using Type = typename T::Layout; +}; + +template +using GetParamLayoutT = typename GetParamLayout::Type; + + + +//////////////////////// //////////// //////////// //////////// +template +struct _TemplateArgs {}; + +template<> +struct _TemplateArgs<> {}; + +template +struct _TemplateArgs : public _TemplateArgs { + using Type = GetParamLayoutT; +}; + + +///////////////////////////////////////////////////////////// +template +struct OpTraitsLayout{ + using Type = T; +}; + + +// Ts 为: Input +struct OpTraitsLayout< std::tuple > { + using Type = _TemplateArgs; +}; + +///////////////////////////////////////////////////////////// + +template +void LayoutSet(R& layout, T&& src, Shapes&&... 
ext){ + layout.template SetLayout(src); + if constexpr( sizeof...(ext) > 0){ + LayoutSet(layout, std::forward(ext)...); + } +} + + +} //__auxc + + + + +} //ATVOS + + +#endif //_ATVC_IMPL_H_IMPL_H_ \ No newline at end of file diff --git a/best_practices/atvos/include/common/layout.h b/best_practices/atvos/include/common/layout.h new file mode 100644 index 0000000000000000000000000000000000000000..73a5c1cc89183c9cab461af348e1830eccde2be0 --- /dev/null +++ b/best_practices/atvos/include/common/layout.h @@ -0,0 +1,116 @@ +#ifndef _ATVOSS_LAYOUT_H_ +#define _ATVOSS_LAYOUT_H_ + +#pragma once + +namespace ATVOS { + +template +using Shape = std::tuple; + +template +using Stride = std::tuple; + +template +using Coord = std::tuple; + +template +constexpr Shape make_shape(Ts const&... t) { + return {t...}; +} +template +constexpr Stride make_stride(Ts const&... t) { + return {t...}; +} + +template +constexpr Coord make_coord(Ts const&... t) { + return {t...}; +} + +template > +struct Layout : private std::tuple { + constexpr Layout(Shape const& shape = {}, Stride const& stride = {}) : std::tuple(shape, stride) + {} + constexpr decltype(auto) layout() { return *this; } + + constexpr decltype(auto) layout() const { return *this; } + + template + constexpr decltype(auto) shape() { + return get<0,I...>(static_cast&>(*this)); + } + + template + constexpr decltype(auto) shape() const { + return get<0,I...>(static_cast const&>(*this)); + } + + template + constexpr decltype(auto) stride() { + return get<1,I...>(static_cast&>(*this)); + } + + template + constexpr decltype(auto) stride() const { + return get<1,I...>(static_cast const&>(*this)); + } + template constexpr auto operator()(Coord const& coord) const { + if constexpr (has_underscore::value) { + return slice(coord, *this); + } else { + return crd2idx(coord, shape(), stride()); + } + } + + // Convenience function for multi-dimensional coordinates + template + constexpr + auto + operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const { + return operator()(make_coord(c0,c1,cs...)); + } + +}; + + + +template +constexpr decltype(auto) layout(Layout const& layout) +{ + if constexpr (sizeof...(Is) == 0) { + return layout; + } else { + return get(layout); + } +} + +template +constexpr decltype(auto) shape(Layout& layout) +{ + return layout.template shape(); +} + +template +constexpr decltype(auto) shape(Layout const& layout) +{ + return layout.template shape(); +} + +template +constexpr auto size(Layout const& layout) +{ + return size(shape(layout)); +} + +template +constexpr auto rank(Layout const& layout) +{ + return rank(shape(layout)); +} + +} //ATVOS + + + +#endif //_ATVOSS_LAYOUT_H_ \ No newline at end of file diff --git a/best_practices/atvos/include/common/placeholder.h b/best_practices/atvos/include/common/placeholder.h new file mode 100644 index 0000000000000000000000000000000000000000..bdca4cd9cf41ad4d832be6a361500765eeb07c37 --- /dev/null +++ b/best_practices/atvos/include/common/placeholder.h @@ -0,0 +1,323 @@ + +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file placeholder.h + * \brief + */ +#ifndef _ATVOS_PLACEHOLDER_H_ +#define _ATVOS_PLACEHOLDER_H_ + +#ifndef __CCE_AICORE__ +#define __aicore__ +#define __global__ +#define __gm__ +#endif + +namespace Placeholder { + +#define CONCATENATE_DETAIL(name, idx) name##idx +#define MAKE_UNIQ_NAME(name, idx) CONCATENATE_DETAIL(name, idx) +#define MAKE_CONST(t, v) \ + struct MAKE_UNIQ_NAME(ConcatValue_##t##_, __COUNTER__) { \ + constexpr static bool IsConstValue_ = true; \ + static constexpr t value = v; \ + } + +enum HolderScope { GM, VECTOR }; + +template +struct Holder { + constexpr static int Pos = cur; + constexpr static bool IsHolder = true; + using DType = T; + using Attr = Attr_; +}; + +template +struct InAttr { + constexpr static int IsScalar = isScalar; + constexpr static HolderScope Scope = scope; +}; + +template +struct ScalarAttr : public InAttr {}; + +template +struct ScopeAttr : public InAttr {}; + +/** GM输入, 存在Scalar输入 */ +template > +struct In0 : public Holder {}; +template > +struct In1 : public Holder {}; +template > +struct In2 : public Holder {}; +template > +struct In3 : public Holder {}; +template > +struct In4 : public Holder {}; +template > +struct In5 : public Holder {}; +template > +struct In6 : public Holder {}; +template > +struct In7 : public Holder {}; +template > +struct In8 : public Holder {}; +template > +struct In9 : public Holder {}; +template > +struct In10 : public Holder {}; +template > +struct In11 : public Holder {}; + +template +struct IsInHolder { + constexpr static bool Value = false; +}; +template class T> +struct IsInHolder> { + constexpr static bool Value = false; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; +template +struct IsInHolder> { + constexpr static bool Value = true; +}; + +/** 判断输入是否是Scalar输入 */ +template +struct IsInScalar { + constexpr static bool Value = false; +}; +template class T> +struct IsInScalar> { + constexpr static bool Value = Attr::IsScalar == 1; +}; + +/** GM输出,不存在Scalar输出 */ +template +struct Out0 : public Holder { + using Parent = Parent_; +}; +template +struct Out1 : public Holder { + using Parent = Parent_; +}; +template +struct Out2 : public Holder { + using Parent = Parent_; +}; +template +struct Out3 : public Holder { + using Parent = Parent_; +}; +template +struct Out4 : public Holder { + using Parent = Parent_; +}; +template 
+struct Out5 : public Holder { + using Parent = Parent_; +}; +template +struct Out6 : public Holder { + using Parent = Parent_; +}; +template +struct Out7 : public Holder { + using Parent = Parent_; +}; +template +struct Out8 : public Holder { + using Parent = Parent_; +}; +template +struct Out9 : public Holder { + using Parent = Parent_; +}; + +template +struct IsOutHolder { + constexpr static bool Value = false; +}; +template class T> +struct IsOutHolder> { + constexpr static bool Value = false; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; +template +struct IsOutHolder> { + constexpr static bool Value = true; +}; + +template +struct Var : public Holder { + constexpr static bool IsVarValue = true; +}; + +#define CHECK_TYPE(NAME, V) \ + template \ + struct NAME { \ + template \ + static uint8_t has(...); \ + template \ + static uint32_t has(decltype(C::V)); \ + constexpr static bool Value = sizeof(has(NULL)) == sizeof(uint32_t); \ + } + +CHECK_TYPE(IsVar, IsVarValue); +CHECK_TYPE(IsConstValue, IsConstValue_); + +/** 值设置:定义值存储结构 */ +template +struct VarTypeStruct {}; +template <> +struct VarTypeStruct<> {}; +template +struct VarTypeStruct { + using DataType = typename T::DType; + DataType value; + template + __aicore__ inline constexpr DataType Get() { + return value; + } + template + __aicore__ inline constexpr void Set(DataType v) { + value = v; + } +}; + +template +struct VarTypeStruct : public VarTypeStruct { + using DataType = typename T::DType; + DataType value; + template + __aicore__ inline constexpr DataType Get() { + if constexpr (offset == 0) { + return value; + } else { + return VarTypeStruct::template Get(); + } + } + template + __aicore__ inline constexpr void Set(DataType v) { + if constexpr (offset == 0) { + value = v; + } else { + VarTypeStruct::template Set(v); + } + } +}; + +template +struct VarTypeAux { + using Type = VarTypeStruct; +}; + + +} // namespace Placeholder + +#pragma region +template +struct Int { + constexpr static int value = v; +}; +#pragma endregion + + + +#endif // _ATVOS_PLACEHOLDER_H_ diff --git a/best_practices/atvos/include/device/device_vector.h b/best_practices/atvos/include/device/device_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9ec8a0d74cc89a3cac5e0e8efdcb583c94e5984b --- /dev/null +++ b/best_practices/atvos/include/device/device_vector.h @@ -0,0 +1,58 @@ +#ifndef _DEVICE_VECTOR_H_ +#define _DEVICE_VECTOR_H_ + +#include "../dfx/check.h" + + +namespace ATVOS { + + +using Stream = int*; + +/** + * @brief + * + * @tparam OpTraits_ 需要的Layout 信息 + * @tparam KernelOp kernel 的入口 + * @tparam Policy 策略: 决策KernelOp的处理策略, 例如:Reduce 时分解为2个OpKernel, 前一个OpKernel 处理Reduce 到workspace, 后一个OpKernel 从workspace 到gm + * @tparam Enable + */ +template +class DeviceAdapt { +public: + using OpTraits = std::conditional_t, typename KernelOp::OpTraits, OpTraits_>; + using Arguments = std::conditional_t, typename 
KernelOp::Arguments, ArgumentsBase>; + using Policy = std::conditional_t, typename KernelOp::Policy, Policy_>; +public: + static_assert( Dfx::IsTupleV, "OpTraits_ is std::tuple"); +public: + DeviceAdapt(){}; + + bool CanImplement(Arguments& arg); //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg); + int64_t Run(Arguments& args, Policy* policy=nullptr, Stream stream = nullptr); + +}; + +#include "device_vector_impl.h" + + + + +} // ATVOS + + + +#endif //_DEVICE_VECTOR_H_ + + + + + + + + + + + + \ No newline at end of file diff --git a/best_practices/atvos/include/device/device_vector_impl.h b/best_practices/atvos/include/device/device_vector_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..3b52be84e21917ff0f0667fc4adbfa3df662e712 --- /dev/null +++ b/best_practices/atvos/include/device/device_vector_impl.h @@ -0,0 +1,31 @@ +#ifndef __ATVOS_DEVECOR_VECTOR_IMPPL_H_ +#define __ATVOS_DEVECOR_VECTOR_IMPPL_H_ + + + +template +bool DeviceAdapt::CanImplement(Arguments& arg){ + //检查KernelOp 的In,out 是否进行了 gm 地址的赋值 + return 0; +}; +template +int64_t DeviceAdapt< KernelOp, OpTraits_, Policy_, Schedule, Enable>::GetWorkspaceSize(Arguments& arg) +{ + return 0; +}; + +template +int64_t DeviceAdapt::Run(Arguments& arg, Policy* policy, Stream stream){ + //构建Kernel 对象,然后调用 + KernelOp op; + + op.Run(arg, nullptr); + return 0; +}; + + + + + +#endif //__ATVOS_DEVECOR_VECTOR_IMPPL_H_ + diff --git a/best_practices/atvos/include/dfx/check.h b/best_practices/atvos/include/dfx/check.h new file mode 100644 index 0000000000000000000000000000000000000000..604ac7db46039b29fea105d9992fbac327ef33d5 --- /dev/null +++ b/best_practices/atvos/include/dfx/check.h @@ -0,0 +1,25 @@ +#ifndef _ATVOS_DFX_CHECK_H_ +#define _ATVOS_DFX_CHECK_H_ + +namespace ATVOS { +namespace Dfx { + +template +struct IsTuple { constexpr static bool Value = false; }; + +template +struct IsTuple< std::tuple > { constexpr static bool Value = true; }; + +template +constexpr bool IsTupleV = IsTuple::Value; + +template +constexpr bool IsVoid = std::is_same_v; + + +} //namespace Dfx +} //namespace ATVOS + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/include/dfx/kernel_mirror.h b/best_practices/atvos/include/dfx/kernel_mirror.h new file mode 100644 index 0000000000000000000000000000000000000000..214dc0b428d5e6fc9b0399226492e2a608e7ab9e --- /dev/null +++ b/best_practices/atvos/include/dfx/kernel_mirror.h @@ -0,0 +1,32 @@ +#ifndef _ATVOS_DFX_KERNEL_MIRROR_H_ +#define _ATVOS_DFX_KERNEL_MIRROR_H_ + + + +#ifdef __CCE_AICORE__ +#include "kernel_operator.h" +using namespace AscendC; +#else +// CPU编译过程中需要Mock掉Kernel侧的实现 +#define __aicore__ +#define __global__ +#define __gm__ +struct half { + uint16_t value; +}; +struct bfloat16_t { + uint16_t value; +}; +template +struct LocalTensor {}; + +template +struct GlobalTensor { + void SetGlobalBuffer(T* y) {} +}; +#endif + + + + +#endif \ No newline at end of file diff --git a/best_practices/atvos/include/kernel/kernel_elemwise.h b/best_practices/atvos/include/kernel/kernel_elemwise.h new file mode 100644 index 0000000000000000000000000000000000000000..870b17241e7d2730315c198a8fd1446f6bb32775 --- /dev/null +++ b/best_practices/atvos/include/kernel/kernel_elemwise.h @@ -0,0 +1,35 @@ +#ifndef __KERNEL_ELEMWISE_H_ +#define __KERNEL_ELEMWISE_H_ + +namespace ATVOS { + +template +class KernelElemwise +{ +public: + static_assert( !(Dfx::IsVoid && Dfx::IsVoid), "OpTraits_ is not void"); + +public: + using OpTraits = std::conditional_t, typename BlockOp::OpTraits, 
OpTraits_>; + using Arguments= std::conditional_t, typename BlockOp::Arguments, ArgumentsBase>; + using Policy = std::conditional_t, typename BlockOp::Policy, Policy_>; +public: + static_assert( Dfx::IsTupleV, "OpTraits_ is std::tuple"); +public: + KernelElemwise(){}; + + bool CanImplement(Arguments& arg); //检查参数的合法性 + int64_t GetWorkspaceSize(Arguments& arg); + int64_t Run(Arguments& args, Policy* policy); + +protected: + Policy policy; +}; + + +#include "kernel_elemwise_impl.h" + +} // ATVOS + + +#endif //__KERNEL_ELEMWISE_H_ \ No newline at end of file diff --git a/best_practices/atvos/include/kernel/kernel_elemwise_impl.h b/best_practices/atvos/include/kernel/kernel_elemwise_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..7bf3e0ca58f66d5b13186f6a01b2bba087ea9aae --- /dev/null +++ b/best_practices/atvos/include/kernel/kernel_elemwise_impl.h @@ -0,0 +1,24 @@ +#ifndef _ATVOS_KERNLE_ELELMWISE_H_ +#define _ATVOS_KERNLE_ELELMWISE_H_ + +template +bool KernelElemwise::CanImplement(Arguments& arg){ + //检查KernelOp 的In,out 是否进行了 gm 地址的赋值 + return true; +}; +template +int64_t KernelElemwise::GetWorkspaceSize(Arguments& arg) +{ + return 0; +}; + +template +int64_t KernelElemwise::Run(Arguments& args, Policy* policy){ + + BlockOp op; + op.Run(args, policy); + return 0; +}; + + +#endif diff --git a/best_practices/atvos/include/tile/tile_alu.h b/best_practices/atvos/include/tile/tile_alu.h new file mode 100644 index 0000000000000000000000000000000000000000..9fae50a6c0c61e566724b8b853ccc01403dcfb2a --- /dev/null +++ b/best_practices/atvos/include/tile/tile_alu.h @@ -0,0 +1,730 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
+ * \file vec.h + * \brief + */ + +#ifndef _ATVOS_VECTOR_ALU_H_ +#define _ATVOS_VECTOR_ALU_H_ + +namespace ATVOS { +namespace Tile { + +#pragma region "TileOpBase" + +template +struct FunBase { + constexpr static int OutputSize = outSize; + constexpr static int InputSize = inSize; + constexpr static int TempSize = tempSize; + constexpr static int FixedSize = fixedBuf; + constexpr static bool IsAdvanced = isAdvanced; + + // 数据类型列表 + using DataTypes = typename std::tuple; + + // 输入的数据类型列表 + template + using FunRetArgType = typename std::tuple_element::type; + + // 输出的数据类型列表 + template + using FunInArgType = typename std::tuple_element::type; +}; + +// 单输入 +template +struct ElemwiseUnaryOP : public FunBase<1, 1, tempBufSize, fixedBuf, isAdvanced, R, In1> {}; + +// 双输入 +template +struct ElemwiseBinaryOP : public FunBase<1, 2, tempBufSize, fixedBuf, isAdvanced, R, In1, In2> {}; + +// 三输入 +template +struct ElemwiseTernaryOP : public FunBase<1, 3, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3> {}; + +// 四输入 +template +struct ElemwiseQuaternaryOP : public FunBase<1, 4, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4> {}; + +// 五输入 +template +struct ElemwiseQuinaryOP : public FunBase<1, 5, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5> {}; + +// 六输入 +template +struct Elemwise6OP : public FunBase<1, 6, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6> {}; + +// 7 输入 +template +struct Elemwise7OP : public FunBase<1, 7, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7> {}; + +// 8 输入 +template +struct Elemwise8OP : public FunBase<1, 8, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8> {}; + +// 9 输入 +template +struct Elemwise9OP : public FunBase<1, 9, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9> {}; + +// 10 +template +struct Elemwise10OP : public FunBase<1, 10, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9, In10> {}; + +// 11 +template +struct Elemwise11OP : public FunBase<1, 11, tempBufSize, fixedBuf, isAdvanced, R, In1, In2, In3, In4, In5, In6, In7, In8, In9, In10, In11> {}; + + +#pragma region "单输入" + +template +struct Duplicate : public ElemwiseUnaryOP { +__aicore__ inline Duplicate(const LocalTensor& dstLocal, const T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dstLocal, scalar, count); +#endif + } +}; + +template +struct Reciprocal : public ElemwiseUnaryOP { + __aicore__ inline Reciprocal(const LocalTensor& dstLocal, const LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Reciprocal(dstLocal, src, count); +#endif + } +}; + +template +struct Log : public ElemwiseUnaryOP { + __aicore__ inline Log(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Log(dst, src, count); +#endif + } +}; +template +struct Copy : public ElemwiseUnaryOP { + __aicore__ inline Copy(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Copy(dst, src, count); +#endif + } +}; +template +struct Abs : public ElemwiseUnaryOP { + __aicore__ inline Abs(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Abs(dst, src, count); +#endif + } +}; +template +struct Cast : public ElemwiseUnaryOP { + __aicore__ inline Cast(LocalTensor& dst, LocalTensor& src, const uint32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Cast(dst, src, static_cast(roundMode), count); +#endif + } +}; + + +template +struct Exp : public 
ElemwiseUnaryOP { + __aicore__ inline Exp(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Exp(dst, src, count); +#endif + } +}; + +template +struct Sin : public ElemwiseUnaryOP { + __aicore__ inline Sin(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + static constexpr SinConfig config = { SinAlgo::RADIAN_REDUCTION }; + AscendC::Sin(dst, src, count); +#endif + } +}; + +template +struct Cos : public ElemwiseUnaryOP { + __aicore__ inline Cos(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + static constexpr CosConfig config = { CosAlgo::RADIAN_REDUCTION }; + AscendC::Cos(dst, src, count); +#endif + } +}; + +template +struct Sqrt : public ElemwiseUnaryOP { + __aicore__ inline Sqrt(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Sqrt(dst, src, count); +#endif + } +}; + +template +struct ReduceOp : public ElemwiseUnaryOP { + __aicore__ inline ReduceOp(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + (void)dst; + (void)src; + (void)count; +#endif + } +}; + +template +struct Truncate : public ElemwiseUnaryOP { + __aicore__ inline Truncate(LocalTensor& dst, LocalTensor& src, const uint32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Truncate(roundMode)>(dst, src, count); +#endif + } +}; + +template +struct Brc : public ElemwiseUnaryOP{ + __aicore__ inline Brc(LocalTensor& dst, LocalTensor& src, int count) + // __aicore__ inline Brc(LocalTensor& dst, LocalTensor& src, const uint32_t* dstShape, const uint32_t* srcShape, BroadcastTiling* tiling) + { +#ifdef __CCE_AICORE__ + // AscendC::Broadcast(dst, src, dstShape, srcShape, tiling); +#endif + } +}; + +template +struct Not : public ElemwiseUnaryOP { + __aicore__ inline Not(LocalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Not(dst, src, count); +#endif + } +}; +#pragma endregion "单输入" + +#pragma region "Copy" + +template +struct CopyIn : public ElemwiseUnaryOP { + __aicore__ inline CopyIn(const LocalTensor& dst, const GlobalTensor& src, uint32_t count) { +#ifdef __CCE_AICORE__ + DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams{false, 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams, padParams); +#endif + } +}; + +static constexpr uint8_t NDDMA_DIM = 5; +template +struct CopyInBrc : public ElemwiseUnaryOP{ +#ifdef __DAV_C310__ + __aicore__ inline CopyInBrc(const LocalTensor& dst, const GlobalTensor& src, + const MultiCopyParams& params, + const MultiCopyConfig& config = kDefaultMultiCopyConfig) + { + AscendC::DataCopy(dst, src, params); + } +#endif + __aicore__ inline CopyInBrc(const LocalTensor& dst, const GlobalTensor& src, uint32_t count) + { +#ifdef __CCE_AICORE__ + AscendC::DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPadExtParams padParams{false, 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams, padParams); +#endif + } +}; +template +struct CopyOut : public ElemwiseUnaryOP { + __aicore__ inline CopyOut(GlobalTensor& dst, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + DataCopyExtParams copyParams{1, static_cast(count * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPad(dst, src, copyParams); +#endif + } +}; +#pragma endreigon "Copy" + +#pragma region "双输入" + +template +struct Add : public ElemwiseBinaryOP { + __aicore__ inline Add(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + 
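+        // Element-wise add: dst[i] = src1[i] + src2[i] for the first `count` elements;
+        // in CPU (mock) builds the guarded body is compiled out and the constructor is a no-op.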
AscendC::Add(dst, src1, src2, count); +#endif + } +}; + +template +struct Fmod : public ElemwiseBinaryOP { + __aicore__ inline Fmod(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Fmod(dst, src1, src2, count); +#endif + } +}; + +template +struct Adds : public ElemwiseBinaryOP { + __aicore__ inline Adds(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Adds(dst, src, scalar, count); +#endif + } + __aicore__ inline Adds(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Adds(dst, src, scalar, count); +#endif + } + __aicore__ inline Adds(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 + scalar1;; + } +}; + +template +struct Relu : public ElemwiseBinaryOP { + __aicore__ inline Relu(const LocalTensor& dst, const LocalTensor& src, int32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Relu(dst, src, count); +#endif + } +}; + +template +struct Max : public ElemwiseBinaryOP { + __aicore__ inline Max(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Max(dst, src1, src2, count); +#endif + } +}; + +template +struct Maxs : public ElemwiseBinaryOP { + __aicore__ inline Maxs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Maxs(dst, src, scalar, count); +#endif + } + __aicore__ inline Maxs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Max(dst, dst, src, count); +#endif + } +}; + +template +struct Min : public ElemwiseBinaryOP { + __aicore__ inline Min(const LocalTensor& dst, const LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Min(dst, src1, src2, count); +#endif + } +}; + +template +struct Mins : public ElemwiseBinaryOP { + __aicore__ inline Mins(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mins(dst, src, scalar, count); +#endif + } + __aicore__ inline Mins(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mins(dst, src, scalar, count); +#endif + } +}; + +template +struct Sub : public ElemwiseBinaryOP { + __aicore__ inline Sub(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Sub(dst, src1, src2, count); +#endif + } +}; + +template +struct Mul : public ElemwiseBinaryOP { + __aicore__ inline Mul(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Mul(dst, src1, src2, count); +#endif + } +}; + +template +struct Muls : public ElemwiseBinaryOP { + __aicore__ inline Muls(const LocalTensor& dst, const T& scalar, const LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Muls(dst, src, scalar, count); +#endif + } + __aicore__ inline Muls(const LocalTensor& dst, const LocalTensor& src, const T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Muls(dst, src, scalar, count); +#endif + } + __aicore__ inline Muls(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 * scalar1;; + } +}; + +template +struct Div : public ElemwiseBinaryOP { + __aicore__ inline Div(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Div(dst, src1, src2, count); +#endif + } +}; + +template +struct DivHighPrecision : public ElemwiseBinaryOP { + __aicore__ inline 
DivHighPrecision(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + static constexpr DivConfig config = {DivAlgo::DIFF_COMPENSATION}; + AscendC::Div(dst, src1, src2, count); +#endif + } +}; + +template +struct Divs : public ElemwiseBinaryOP { + __aicore__ inline Divs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Div(dst, src, dst, count); +#endif + } + __aicore__ inline Divs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Div(dst, dst, src, count); +#endif + } + __aicore__ inline Divs(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 / scalar1;; + } +}; + +template +struct Subs : public ElemwiseBinaryOP { + __aicore__ inline Subs(LocalTensor& dst, LocalTensor& src, T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Sub(dst, src, dst, count); +#endif + } + __aicore__ inline Subs(LocalTensor& dst, T& scalar, LocalTensor& src, int count) { +#ifdef __CCE_AICORE__ + AscendC::Duplicate(dst, scalar, count); + AscendC::Sub(dst, dst, src, count); +#endif + } + __aicore__ inline Subs(T& dst, T& scalar0, T& scalar1, int count) { + dst = scalar0 - scalar1;; + } +}; + +template +struct LeakyRelu : public ElemwiseBinaryOP { + __aicore__ inline LeakyRelu(LocalTensor& dst, LocalTensor& src, T& scalarValue, int count) { +#ifdef __CCE_AICORE__ + AscendC::LeakyRelu(dst, src, scalarValue, count); +#endif + } +}; + +template +struct Compare : public ElemwiseBinaryOP { + __aicore__ inline Compare(LocalTensor& dstLocal, LocalTensor& src0Local, LocalTensor& src1Local, + uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Compare(dstLocal, src0Local, src1Local, static_cast(cmpMode), count); +#endif + } + + __aicore__ inline Compare(LocalTensor& dstLocal, LocalTensor& src0Local, T src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::CompareScalar(dstLocal, src0Local, src1Scalar, static_cast(cmpMode), count); +#endif + } +}; + +template +struct And : public ElemwiseBinaryOP { + __aicore__ inline And(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::And(dst, src1, src2, count); +#endif + } +}; + +template +struct Ands : public ElemwiseBinaryOP { + __aicore__ inline Ands(LocalTensor& dst, LocalTensor& src, T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ands(dst, src, scalar, count); +#endif + } + __aicore__ inline Ands(LocalTensor& dst, T& scalar, LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ands(dst, scalar, src, count); +#endif + } +}; + +template +struct Or : public ElemwiseBinaryOP { + __aicore__ inline Or(LocalTensor& dst, LocalTensor& src1, LocalTensor& src2, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Or(dst, src1, src2, count); +#endif + } +}; + +template +struct Ors : public ElemwiseBinaryOP { + __aicore__ inline Ors(LocalTensor& dst, LocalTensor& src, T& scalar, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ors(dst, src, scalar, count); +#endif + } + __aicore__ inline Ors(LocalTensor& dst, T& scalar, LocalTensor& src, const int32_t& count) { +#ifdef __CCE_AICORE__ + AscendC::Ors(dst, scalar, src, count); +#endif + } +}; + +template +struct Power : public ElemwiseBinaryOP { + __aicore__ inline Power(LocalTensor& dstLocal, LocalTensor& src0Local, LocalTensor& src1Local, + uint32_t 
count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Local, src1Local, count); +#endif + } + + __aicore__ inline Power(LocalTensor& dstLocal, LocalTensor& src0Local, T& src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Local, src1Scalar, count); +#endif + } + + __aicore__ inline Power(LocalTensor& dstLocal, T& src0Scalar, LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + static constexpr PowerConfig config = {PowerAlgo::DOUBLE_FLOAT_TECH}; + AscendC::Power(dstLocal, src0Scalar, src1Local, count); +#endif + } +}; +#pragma endregion "双输入" + +#pragma region "三输入" + +template +struct Select : public ElemwiseTernaryOP { + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + const LocalTensor& src0Local, const LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Local, src1Local, static_cast(selMode), count); +#endif + } + + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + const LocalTensor& src0Local, T src1Scalar, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Local, src1Scalar, + AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE, count); +#endif + } + + __aicore__ inline Select(const LocalTensor& dstLocal, const LocalTensor& selMask, + T src0Scalar, const LocalTensor& src1Local, uint32_t count) { +#ifdef __CCE_AICORE__ + AscendC::Select(dstLocal, selMask, src0Scalar, src1Local, + AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE, count); +#endif + } +}; + +/** + * dst = src1 + src2 * alpha + */ +template +struct FusedMulAdd : public ElemwiseTernaryOP { + __aicore__ inline FusedMulAdd(const LocalTensor& dst, const LocalTensor& src1, const LocalTensor& src2, + const LocalTensor& alpha, int count) { +#ifdef __CCE_AICORE__ + AscendC::FusedMulAdd(src2, alpha, src1, count); + AscendC::Copy(dst, src2, count); +#endif + } +}; + +/** + * dst = src1 + src2 * scalar + */ +template +struct Axpy : public ElemwiseTernaryOP { + __aicore__ inline Axpy(const LocalTensor& dst, const T& scalar, const LocalTensor& src1, + const LocalTensor& src2, int count) { +#ifdef __CCE_AICORE__ + AscendC::Axpy(src1, src2, scalar, count); + AscendC::Copy(dst, src1, count); +#endif + } + __aicore__ inline Axpy(const LocalTensor& dst, const LocalTensor& src1, const LocalTensor& src2, + const T& scalar, int count) { +#ifdef __CCE_AICORE__ + AscendC::Axpy(src1, src2, scalar, count); + AscendC::Copy(dst, src1, count); +#endif + } +}; +#pragma endregion "三输入" + + +} //namespace Tile +} //namespace ATVOS + + + +#if 0 + +template +struct IsCastOp { + constexpr static bool Value = false; +}; + +template +struct IsCastOp> { + constexpr static bool Value = true; +}; + +template +struct IsCastNoOp { + constexpr static bool Value = false; +}; + +template +struct IsCastNoOp> { + constexpr static bool Value = std::is_same::value; +}; + +template +struct IsCopyInBrcOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyInBrcOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyInOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyInOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyInOp> { + constexpr static bool Value = true; +}; + +template +struct IsVecBrcOp { + constexpr static bool Value = false; +}; + +template 
+struct IsVecBrcOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyOutOp { + constexpr static bool Value = false; +}; + +template +struct IsCopyOutOp> { + constexpr static bool Value = true; +}; + +template +struct IsCopyOutOp> { + constexpr static bool Value = true; +}; + +template +struct IsReduceOp { + constexpr static bool Value = false; +}; + +template +struct IsReduceOp> { + constexpr static bool Value = true; +}; + +template +struct IsDuplicateOp { + constexpr static bool Value = false; +}; + +template +struct IsDuplicateOp> { + constexpr static bool Value = true; +}; + +#endif + + +#endif //_ATVOS_VECTOR_ALU_H_ \ No newline at end of file diff --git a/best_practices/atvos/main/main.cpp b/best_practices/atvos/main/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c48a9f3338173fd933dd9798e906cb32506e3ecf --- /dev/null +++ b/best_practices/atvos/main/main.cpp @@ -0,0 +1,94 @@ +#include + +#include "../include/atvos.h" +#include "../include/tile/tile_alu.h" +#include "../include/block/block_elemwise.h" +#include "../include/kernel/kernel_elemwise.h" +#include "../include/device/device_vector.h" + + + +using namespace ATVOS; + + +/* ------------方案1: ATVOSS方案 -------*/ +// 1, 描述算子原型 +using Input0 = Input> >; //描述每个Block的输入,以及处理的Shape stride大小 +using Input1 = Input> >; //描述每个Block的输入, 自动推导 +using out = Output> >; //描述每个Block的输入,自动推导 + +using OpTraits = std::tuple; + +// 2, 描述Compute 过程 +template +struct MyComputer { + using T = Traits_; + constexpr static auto myAdd = Expr>( Placeholder::In1{}, Placeholder::In2{}); + constexpr static auto subs = Expr>(myAdd, 0.5 ); + constexpr static auto out = std::tuple{subs}; +}; + +// 3, 选择 Block 模版(进行调度) +template +using MyBlock = BlockElemwise::out), Traits_>; + +// 4, 选择 kerne 类型( 处理整尾核) +template +using MyKernel = KernelElemwise< MyBlock>; + +// 5, Device 侧 host+device 混合编程 +using device = DeviceAdapt< MyKernel >; + +int main(int argc, const char* argv[]) { + + + device op; + + void* gmAddr = nullptr; + auto tensor = MakeTensor(gmAddr, MakeShape(1024,1024)); //硬件可能的Shape + + device::Arguments arguments( tensor, tensor, MakeTensor(gmAddr, MakeShape(1024,1024)), 20) ; + + // 校验参数是否合理(当前device能否支持arguments) + auto status = op.CanImplement(arguments); + size_t sizeWorkspace = op.GetWorkspaceSize(arguments); + + op.Run(arguments); + + std::cout << "run" << std::endl; + + return 0; + +} + + +#if 0 + +void kernel() { + TPipe pipe; + constexpr TQue queIn; //编译时分配资源 + constexpr TQue queOut; //编译时分配资源 + constexpr TQue queCalc; //编译时分配资源 + + //vector 操作 + LocalTensor tensor; + LocalTensor tensorOut, tensorOut2; + DataCopy(tensor, gmTesnor, 1024); + queIn.Enque(tensor); + queIn.Deque(tensor); + + Add(tensorOut, tensor, tensor, 1024) + + //Cube 操作 + TQue::EnQue(tensorOut); + TQue::DeQue(tensorOut); + Conv(tensorOut2, tensorOut, ...); + TQue::EnQue(tensorOut2); + TQue::DeQue(tensorOut2); + + DataCopy(gm2, tensorOut2, 1024) + + pipe.staticInitBuffer(queIn); + pipe.staticInitBuffer(queOut); +} +#endif diff --git a/best_practices/atvos_demo/include/block/block_elewise.h b/best_practices/atvos_demo/include/block/block_elewise.h new file mode 100644 index 0000000000000000000000000000000000000000..8f842a6c1d18b9dbed0b69af46947f0738631090 --- /dev/null +++ b/best_practices/atvos_demo/include/block/block_elewise.h @@ -0,0 +1,198 @@ +#ifndef _BLOCK_ELE_WISE_H +#define _BLOCK_ELE_WISE_H +#include +#include "block_tensor.h" +#include "../utils/buf_pool/ele_buf_pool.h" + +namespace ATVC::Block { + +template >> +class 
BlockEleWise{ + //TODO: block层的切分策略 融合到当前demo中: MakeConfig + // PrepareParams: 输入输出gm指针 转换为tuple 包含 LocalTensor资源分配 + // CopyIn CopyOut GlobalTensor -> LocalTensor +public: + using ExpressMaker = ExprMaker; + __aicore__ inline BlockEleWise(){ + bufPool_.InitBufPool(6, 1024, 4, 2048); // Todo: 需要通过tile layout配置 + } + + template + __aicore__ inline Tile::Config MakeConfig(const Config& configBlock, ArgTup& argTuple) + { + Tile::Config configTile{12, 1, 2048}; + return configTile; + } + + template + __aicore__ inline void Run(const Config& configBlock, ArgTup& argTuple) + { + // 配置规划,特别是需要确定使用的 Local Memory 大小 + Tile::Config configTile = MakeConfig(configBlock, argTuple); + auto expr = ExprMaker{}.template Get(); + using Expr = typename decltype(expr)::Type; + // using LocalVars = ATVC::ExprTmpl::LocalVars_t; // TODO: 这里 临时buffer不应该由Expr生成, 应该在host 切分信息计算获得。存放在configBlock中? + // using Params = ATVC::ExprTmpl::Params_t; + // using InParams = ATVC::ExprTmpl::InParams_t; + // using OutParams = ATVC::ExprTmpl::OutParams_t; + + + // /**------------test-----*/ + AscendC::LocalTensor in1 = bufPool_.inQueue.template AllocTensor(); + bufPool_.inQueue.EnQue(in1); + + AscendC::LocalTensor in2 = bufPool_.inQueue.template AllocTensor(); + bufPool_.inQueue.EnQue(in2); + AscendC::LocalTensor out = bufPool_.outQueue.template AllocTensor(); + bufPool_.outQueue.EnQue(out); + + AscendC::LocalTensor in1_1 = bufPool_.inQueue.template DeQue(); + AscendC::LocalTensor in2_1 = bufPool_.inQueue.template DeQue(); + AscendC::LocalTensor out_1 = bufPool_.outQueue.template DeQue(); + + auto ins = AscendC::Std::make_tuple(in1_1, in2_1, out_1); + // // CopyIn(inParams, ins, configTile); + Tile::Evaluate(ins); + AscendC::DumpTensor(out, 222, 64); + // // CopyOut(outParams, configTile); + + bufPool_.inQueue.FreeTensor(in1_1); + bufPool_.inQueue.template FreeTensor(in2_1); + bufPool_.outQueue.template FreeTensor(out_1); + + // // 得到 tuple,向量变成了 Block::Tensor 类型 + // auto params = PrepareParams(argTuple); + // // // 下面得到 tuple,即对 Tensor 的访问都成了引用 + // auto inParams = GetInParams(params); + // auto outParams = GetOutParams(params); + // // // 初始化局部变量及其需要的空间, + // // // auto localVars = PrepareLocalVars(params); + // uint32_t tupeSize = AscendC::Std::tuple_size::value; + // AscendC::printf("copy in tupeSize:%u \n", tupeSize); + // // 循环执行 + // for (int i = 0; i < 1; ++i) { + // // 把 argTuple 里的向量替换成 params 里的对应 Local Tensor引用 + // // AscendC::LocalTensor in1 = bufPool_.inQueue.template AllocTensor(); + // // AscendC::LocalTensor in2 = bufPool_.inQueue.template AllocTensor(); + // // auto ins = AscendC::Std::make_tuple(in1, in2); + // // auto combinedArgs = CombineArgs(inParams, outParams); // 这里 in/out的类型不一样,去申请ub tensor 的操作不一样 输入输出单独传递 + // // CopyIn(inParams, ins, configTile); + // // Tile::Evaluate(combinedArgs); + // // CopyOut(outParams, configTile); + // } + // // // 尾块的处理 + // // for (int i = 0; i < configTile.tileBlockCount; ++i) { + + // // } + } +private: +template +__aicore__ inline auto CombineArgs(ArgIn& inParams, ArgOut& outParams) +{ + auto sizeIn = AscendC::Std::tuple_size::value; + auto sizeOut = AscendC::Std::tuple_size::value; + AscendC::LocalTensor in1 = bufPool_.inQueue.template AllocTensor(); + AscendC::LocalTensor in2 = bufPool_.inQueue.template AllocTensor(); + AscendC::LocalTensor out1 = bufPool_.outQueue.template AllocTensor(); + return AscendC::Std::make_tuple(in1, in2, out1); +} + + template + __aicore__ inline constexpr auto ConstructParam(ArgTup& args) + { // 构建 tensor 关联gmTensor信息 + return typename 
AscendC::Std::decay_t( + AscendC::Std::get(args)); + } + + template + __aicore__ inline constexpr auto PrepareParamsImpl(ArgTup& args, AscendC::Std::index_sequence) + { + return AscendC::Std::make_tuple( + ConstructParam< + Util::TMP::FindUnique_t::template Checker, Params>>( + args)...); + } + + template + __aicore__ inline constexpr auto PrepareParams(ArgTup& argTuple) + { + return PrepareParamsImpl( + argTuple, AscendC::Std::make_index_sequence>{}); + } +// template +// __aicore__ inline constexpr auto PrepareLocalVars(ArgTup& argTuple) +// { +// return PrepareParamsImpl( +// argTuple, AscendC::Std::make_index_sequence>{}); +// } + + template + __aicore__ inline constexpr auto GetOneParam(ParamTup& params) + { + using Param = Util::TMP::Get_t; + if constexpr (((Param::usage == usages) || ...)) { + auto tensor = AscendC::Std::forward_as_tuple(AscendC::Std::get(params)); + //todo: 分配 UB tesnor给当前输入输出 + if constexpr (Param::usage == ATVC::ExprTmpl::ParamUsage::in || Param::usage == ATVC::ExprTmpl::ParamUsage::in_out) { + tensor.isInTensor = true; + } else if constexpr (Param::usage == ATVC::ExprTmpl::ParamUsage::out) { + tensor.isInTensor = false; + } + return AscendC::Std::forward_as_tuple(AscendC::Std::get(params)); + } else { + return AscendC::Std::tuple<>{}; + } + } + + template + __aicore__ inline constexpr auto GetInParamsImpl(ParamTup& params, AscendC::Std::index_sequence) + { + return AscendC::Std::make_tuple( + GetOneParam(params)...); + } + + template + __aicore__ inline constexpr auto GetOutParamsImpl(ParamTup& params, AscendC::Std::index_sequence) + { + return AscendC::Std::make_tuple( + GetOneParam(params)...); + } + template + __aicore__ inline auto GetInParams(ParamTup& params) + { + constexpr auto size = Util::TMP::Size_v; + static_assert(size == AscendC::Std::tuple_size::value); + return GetInParamsImpl( + params, AscendC::Std::make_index_sequence{}); + } + + template + __aicore__ inline auto GetOutParams(ParamTup& params) + { + constexpr auto size = Util::TMP::Size_v; + static_assert(size == AscendC::Std::tuple_size::value); + return GetOutParamsImpl( + params, AscendC::Std::make_index_sequence{}); + } + + template + __aicore__ inline void CopyIn(InParams& inParams, InUbs& inUns, Tile::Config &configTile) + { + uint32_t tupeSize = AscendC::Std::tuple_size::value; + AscendC::printf("copy in tupeSize:%u \n", tupeSize); + // AscendC::Std::get<0>(inParams).CopyIn(configTile.gmOffset, configTile.basicNum); + // AscendC::Std::get<1>(inParams).CopyIn(configTile.gmOffset, configTile.basicNum); + } + + // template + // __aicore__ inline void CopyOut(OutParams& outParams, Tile::Config &configTile) + // { + // uint32_t tupeSize = AscendC::Std::tuple_size::value; + // AscendC::Std::get<0>(outParams).CopyOut(configTile.gmOffset, configTile.basicNum); + // } +private: + ATVC::UTILS::EleBufPool bufPool_; // kernel调用block的时候 传入bufPool(kernel层实例化bufPool_,多次调用kernel的时候可以复用) +}; +} // namespace ATVC::Block + +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/block/block_tensor.h b/best_practices/atvos_demo/include/block/block_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..9700c7dfcf3084f0212430fbda57824a99bb0b39 --- /dev/null +++ b/best_practices/atvos_demo/include/block/block_tensor.h @@ -0,0 +1,67 @@ +#ifndef _ATVC_BLOCK_TENSOR_H +#define _ATVC_BLOCK_TENSOR_H +#include "../utils/buf_pool/ele_buf_pool.h" +#include "../tile/tile_operator.h" + + +namespace ATVC::Block { + +template +class Tensor { +public: + __aicore__ inline 
Tensor() = default; + + __aicore__ inline Tensor(AscendC::GlobalTensor& gmTensor) + { + SetGmTensor(gmTensor); + } + + __aicore__ inline void SetSize(std::size_t size) + { + size_ = size; + } + __aicore__ inline uint64_t GetSize() const + { + return size_; + } + __aicore__ inline uint64_t GetCurGmOffset() const + { + return curGmOffset_; + } + + __aicore__ inline AscendC::LocalTensor* GetUbTensor() const + { + return ubTensor_; + } + __aicore__ inline AscendC::GlobalTensor* GetGmTensor() const + { + return gmTensor_; + } + __aicore__ inline void AssignUbTensor(ATVC::UTILS::EleBufPool &bufPool) + { + AscendC::LocalTensor* ubTensor = &(bufPool.inQueue.template AllocTensor()); + ubTensor_ = ubTensor; + } + __aicore__ inline void SetGmTensor(AscendC::GlobalTensor* gmTensor) const + { + gmTensor_ = gmTensor; + } + + __aicore__ inline void CopyIn(uint64_t offset, uint32_t copyCnt) + { + ATVC::Tile::CopyIn(ubTensor_, gmTensor_[offset], copyCnt); + } + + __aicore__ inline void CopyOut(uint64_t offset, uint32_t copyCnt) + { + ATVC::Tile::CopyIn(gmTensor_[offset], ubTensor_, copyCnt); + } + + AscendC::LocalTensor* ubTensor_; + AscendC::GlobalTensor* gmTensor_; + uint64_t size_; + uint64_t curGmOffset_; + bool isInTensor = false; +}; +} +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/common.h b/best_practices/atvos_demo/include/common.h new file mode 100644 index 0000000000000000000000000000000000000000..64d2503a83ecc33b4466dfe90d09f8a54d07ebcb --- /dev/null +++ b/best_practices/atvos_demo/include/common.h @@ -0,0 +1,46 @@ +#ifndef _ATVC_COMMON_H +#define _ATVC_COMMON_H +namespace ATVC { + +namespace Tile { +struct Config {//tile层的切分信息 + uint32_t wholeBlockCount = 0; + uint32_t tileBlockCount = 0; + uint32_t basicNum = 0; + uint64_t gmOffset= 0; +}; +} + + +namespace Block { +struct Config {// block层的切分信息 + uint32_t wholeBlockCount = 0; // 一个block中 + uint32_t tileBlockCount = 0; + uint32_t basicNum = 0; +}; +} + +namespace Kernel { + struct UbAssign { + uint32_t ubInNum = 1; // 输入需要占用的UB tensor 个数 + uint32_t ubOutCnt = 1; // 输出需要占用的UB tensor 个数 + uint32_t ubTmpCnt = 0; // tmp需要占用的UB tensor 个数 + uint32_t eleNumInTensor = 1024; // 一个in tensor的元素个数 + uint32_t eleNumOutTensor = 1024; // 一个out tensor的元素个数 + uint32_t eleNumTmpTensor = 1024; // 一个tmp tensor的元素个数 +}; +struct Config{ // kernel 层的切分信息 + uint32_t blockNum = 1; + uint32_t basicNum = 0; + UbAssign ubAssign; +}; +} + +struct AttrStruct { + ATVC::Kernel::Config configKernel{1, 1024*10}; + ATVC::Block::Config configBlock{1, 1, 1024*10}; + ATVC::Tile::Config configTile{10, 0, 1024*10, 0}; +}; +} + +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/device/device_elewise.h b/best_practices/atvos_demo/include/device/device_elewise.h new file mode 100644 index 0000000000000000000000000000000000000000..2c05501d0cdd02456c75c852ef5b1f62edf24b1f --- /dev/null +++ b/best_practices/atvos_demo/include/device/device_elewise.h @@ -0,0 +1,271 @@ +#ifndef _DEVICE_ELE_WISE_H +#define _DEVICE_ELE_WISE_H +#include +#include "acl/acl.h" +#include "device_tensor.h" +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0) + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); 
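+    // Device, context and stream are now ready; the caller owns them and must release
+    // them with CleanACL() once the kernel has finished.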
+}
+
+void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+
+
+// template
+// __global__ __aicore__ void MyKernel(GM_ADDR configDevice, GM_ADDR x, GM_ADDR y, GM_ADDR z)
+// {
+//     KernelOp op;
+//     AscendC::TPipe pipeIn;
+//     AscendC::printf("x: %p\n", x);
+//     AscendC::printf("y: %p\n", y);
+//     AscendC::printf("z: %p\n", z);
+//     op.Run(configDevice, x, y, z);
+// }
+
+
+// struct AttrStruct {
+//     ATVC::Kernel::Config configKernel{1, 1024*10};
+//     ATVC::Block::Config configBlock{1, 1, 1024*10};
+//     ATVC::Tile::Config configTile{10, 0, 1024*10, 0};
+// };
+namespace ATVC::Device {
+
+template
+void LaunchKernelWithDataTuple(uint32_t blockNum, aclrtStream& stream, uint8_t* configDevice, ArgTup& argTuple)
+{
+    uint8_t* z = std::get<0>(argTuple).GetPtr();
+    uint8_t* x = std::get<1>(argTuple).GetPtr();
+    uint8_t* y = std::get<2>(argTuple).GetPtr();
+    MyKernel<<<blockNum, nullptr, stream>>>(configDevice, x, y, z);
+    // template
+    // __global__ __aicore__ void MyKernel(GM_ADDR configDevice, ArgTuple inputOutputs, AttrStruct attr)
+    // MyKernel<<>>(configDevice, argTuple, attr);
+}
+
+/*
+ExprMaker maker: describes the compute expression
+Args&&... args: input/output data
+*/
+template
+class DeviceEleWise
+// TODO: MakeConfig - overall tiling strategy and UB space allocation strategy
+{
+public:
+    using ExprMaker = typename KernelOp::ExprMaker;
+    DeviceEleWise(){};
+    // compute the kernel-level tiling strategy
+    ATVC::AttrStruct MakeConfig() {
+        Kernel::Config configKernel{1, 1024*10};
+        Block::Config configBlock{1, 1, 1024*10};
+        Tile::Config configTile{10, 0, 1024*10, 0};
+        return ATVC::AttrStruct{configKernel, configBlock, configTile};
+    }
+
+    template
+    int64_t Run(Args&&... args)
+    {
+        // configuration planning: host-side tiling computation and UB space allocation
+        auto config = MakeConfig();
+        auto expr = ExprMaker{}.template Get(); // Tensor is the type that represents device-level vectors
+        // using Expr = decltype(expr);
+        using Expr = typename decltype(expr)::Type;
+        // static_assert(!Expr::hasData, "The expression template used on the device side must not carry data");
+        using Params = ATVC::ExprTmpl::Params_t;
+        using InParams = ATVC::ExprTmpl::InParams_t;
+        using OutParams = ATVC::ExprTmpl::OutParams_t;
+        // pack the arguments into a tuple
+        auto argTuple = std::forward_as_tuple(std::forward<Args>(args)...); // forwarding references require std::forward
+        aclrtContext context;
+        int32_t deviceId = 0;
+        aclrtStream stream = nullptr;
+        InitializeACL(context, stream, deviceId);
+        // obtain a tuple in which the vectors have become device-specific Tensor types
+        auto params = PrepareParams(argTuple);
+        // the following yields a tuple whose elements refer to those Tensors
+        // tuple of references
+        auto inParams = GetInParams(params);
+        auto outParams = GetOutParams(params);
+
+        // copy the input vectors in argTuple into the corresponding Tensors in inParams
+        CopyIn(inParams, argTuple);
+
+        // copy config to GM
+        uint8_t* configDevice = nullptr;
+        // CopyConfig(config, configDevice);
+        // pass the arguments to the device side for execution
+        LaunchKernelWithDataTuple(config.configKernel.blockNum, stream, configDevice, params);
+        CleanACL(stream, context, deviceId);
+        // copy the Tensors in outParams back into the corresponding output vectors in argTuple
+        CopyOut(outParams, argTuple);
+
+        // the Tensors and configDevice still need manual cleanup
+        // ClearConfig(configDevice);
+        // ClearTensors(params);
+        return 0;
+    }
+
+private:
+    template
+    auto GetInParams(ParamTup& params)
+    {
+        constexpr auto size = Util::TMP::Size_v;
+        static_assert(size == std::tuple_size_v);
+        return GetInParamsImpl(
+            params, std::make_index_sequence{});
+    }
+
+    template
+    auto GetOutParams(ParamTup& params)
+    {
+        constexpr auto size = Util::TMP::Size_v;
+        static_assert(size == std::tuple_size_v);
+        return GetOutParamsImpl(
+            params, std::make_index_sequence{});
+    }
+
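+    // Note: Run() currently launches the kernel with configDevice left as a
+    // null pointer because the CopyConfig/ClearConfig calls are commented out.
+    // A minimal sketch of what those helpers could look like is kept below as
+    // a comment; the helper names, signatures, and their placement in this
+    // class are assumptions rather than part of the original design.
+    //
+    // void CopyConfig(const ATVC::AttrStruct& config, uint8_t*& configDevice)
+    // {
+    //     // allocate a device-side buffer for the config blob and copy it over
+    //     CHECK_ACL(aclrtMalloc(reinterpret_cast<void**>(&configDevice),
+    //                           sizeof(ATVC::AttrStruct), ACL_MEM_MALLOC_HUGE_FIRST));
+    //     CHECK_ACL(aclrtMemcpy(configDevice, sizeof(ATVC::AttrStruct),
+    //                           &config, sizeof(ATVC::AttrStruct),
+    //                           ACL_MEMCPY_HOST_TO_DEVICE));
+    // }
+    //
+    // void ClearConfig(uint8_t* configDevice)
+    // {
+    //     // release the device-side config blob once the kernel has finished
+    //     CHECK_ACL(aclrtFree(configDevice));
+    // }
+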
template + void CopyIn(InParamTup& inParams, ArgTup& args) + { + constexpr auto size = Util::TMP::Size_v; + static_assert(size == std::tuple_size_v); + CopyInImpl(inParams, args, + std::make_index_sequence{}); + } + + template + void CopyOut(OutParamTup& outParams, ArgTup& args) + { + constexpr auto size = Util::TMP::Size_v; + static_assert(size == std::tuple_size_v); + CopyOutImpl(outParams, args, + std::make_index_sequence{}); + } + + template + constexpr auto ConstructParam(ArgTup& args) + { // device内存分配 + // 从 args 元组中获取第 ParamType::number - 1 个元素 + // 将 ParamType::Type 转换为非引用、非常量类型 + return typename std::decay_t( + std::get(args)); + } + + template + constexpr auto PrepareParamsImpl(ArgTup& args, std::index_sequence) + { + return std::make_tuple( + ConstructParam< + Util::TMP::FindUnique_t::template Checker, Params>>( + args)...); + } + + template + constexpr auto PrepareParams(ArgTup& argTuple) + { + return PrepareParamsImpl( + argTuple, std::make_index_sequence>{}); + } + + template + constexpr auto ConvertOneArg(ParamTup& params, ArgTup& args) + { + constexpr auto pos = + Util::TMP::Find_v::template Checker, Params>; + if constexpr (pos < Util::TMP::Size_v) { + return std::get(params); + } else { + return std::get(args); + } + } + + template + constexpr auto ConvertArgsImpl(ParamTup& params, ArgTup& args, + std::index_sequence) + { + return std::make_tuple(ConvertOneArg(params, args)...); + } + + template + auto ConvertArgs(ParamTup& params, ArgTup& args) + { + return ConvertArgsImpl( + params, args, + std::make_index_sequence>{}); + } + + template + constexpr auto GetOneParam(ParamTup& params) + { + using Param = Util::TMP::Get_t; + if constexpr (((Param::usage == usages) || ...)) { + return std::forward_as_tuple(std::get(params)); + } else { + return std::tuple<>{}; + } + } + + template + constexpr auto GetInParamsImpl(ParamTup& params, std::index_sequence) + { + return std::tuple_cat( + GetOneParam(params)...); + } + + template + constexpr auto GetOutParamsImpl(ParamTup& params, std::index_sequence) + { + return std::tuple_cat( + GetOneParam(params)...); + } + + template + void CopyInOneParam(T& param, ArgTup& args) + { + using Param = Util::TMP::Get_t; + param.CopyIn(std::get(args)); + } + + template + void CopyInImpl(InParamTup& inParams, ArgTup& args, + std::index_sequence) + { + (CopyInOneParam(std::get(inParams), args), ...); + } + + template + void CopyOutOneParam(T& param, ArgTup& args) + { + using Param = Util::TMP::Get_t; + param.CopyOut(std::get(args)); + } + + template + void CopyOutImpl(OutParamTup& outParams, ArgTup& args, + std::index_sequence) + { + (CopyOutOneParam(std::get(outParams), args), ...); + } +}; + +} // namespace ATVC::Device +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/device/device_tensor.h b/best_practices/atvos_demo/include/device/device_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..6e015fd117f285a64106d8b088f20cc4b08301e9 --- /dev/null +++ b/best_practices/atvos_demo/include/device/device_tensor.h @@ -0,0 +1,71 @@ +#include +#include "acl/acl.h" +namespace ATVC::Device { + +template +class Tensor { +public: + Tensor() = default; + explicit Tensor(std::vector& src) + { + SetSize(src.size()); + } + + T& operator[](std::size_t pos) + { + return ptr_[pos]; + } + const T& operator[](std::size_t pos) const + { + return ptr_[pos]; + } + + void Clear() + { + delete[] ptr_; + ptr_ = nullptr; + len_ = 0; + } + + void SetSize(std::size_t size) + { + if (ptr_ != nullptr) { + throw 
std::logic_error( + "MyTensor::SetSize can only be called on an empty object"); + } + aclrtMalloc((void **)&ptr_, size * sizeof(T), ACL_MEM_MALLOC_HUGE_FIRST); + std::cout<<"Malloc Tensor: "<& src) + { + if (src.size() > len_) { + throw std::logic_error("Source vector is too big"); + } + std::cout<<"CopyIn Tensor: "<& dst) + { + if (dst.size() < len_) { + throw std::logic_error("Destination vector is too small"); + } + std::cout<<"CopyOut Tensor: "< + +namespace ATVC::Kernel { + +template +class KernelEleWise{ + //TODO: kenel层的切分策略 融合到当前demo中: MakeConfig + // PrepareParams: 输入输出gm指针 转换为tuple 包含 GlobalTensor资源分配 +public: + using ExprMaker = typename BlockOp::ExpressMaker; + __aicore__ inline KernelEleWise(){}; + + // 计算block切分策略 + __aicore__ inline Block::Config MakeConfig(GM_ADDR configKernel) { + Block::Config configBlock{10, 1, 1024}; + return configBlock; + } + + // 需要从Params中萃取每个输入输出的Dtype 生成每个输入输出的Gm地址, gm内存大小, 生成GlobalTensor组成的tuple + template + __aicore__ inline auto PrepareParams(Args... args) { + AscendC::Std::tuple, AscendC::GlobalTensor, AscendC::GlobalTensor> params{}; + // AscendC::GlobalTensor globalTensor; + // globalTensor.SetGlobalBuffer(reinterpret_cast<__gm__ DataType*>(src), srcDataSize); + return params; + } + + // 运行接口 + template + __aicore__ inline void Run(GM_ADDR configKernel, Args... args) + { + // 配置Block层切分策略 + Block::Config configBlock = MakeConfig(configKernel); + + auto expr = ExprMaker{}.template Get(); + using Expr = typename decltype(expr)::Type; + // static_assert(!Expr::hasData, "设备侧使用的表达式模板不应包含数据"); + using Params = ATVC::ExprTmpl::Params_t; + using InParams = ATVC::ExprTmpl::InParams_t; + using OutParams = ATVC::ExprTmpl::OutParams_t; + + // 得到 tuple,向量变成了 GlobalTensor 类型, + // 并根据 blockId 调整每个 GlobalTensor 的偏移量和大小 + auto params = PrepareParams(args...); + uint32_t tupeSize = AscendC::Std::tuple_size::value; + AscendC::printf("[kernel] copy in tupeSize:%u \n", tupeSize); + AscendC::printf("[kernel] copy in params:%u \n", params); + // 执行计算 + // 循环执行 + + BlockOp blockOp; + for (int i = 0; i < 1; ++i) { + blockOp.Run(configBlock, params); + } + // // 尾块 + // for (int i = 0; i < configBlock.tileBlockCount; ++i) { + // blockOp.Run(configBlock, params); + // } + } +}; + +} // namespace ATVC::Kernel +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/tile/tile_elewise.h b/best_practices/atvos_demo/include/tile/tile_elewise.h new file mode 100644 index 0000000000000000000000000000000000000000..40891a329215b2b6dc15f8b6554f4bc70f63cded --- /dev/null +++ b/best_practices/atvos_demo/include/tile/tile_elewise.h @@ -0,0 +1,17 @@ +#ifndef TILE_ELE_WISE_H +#define TILE_ELE_WISE_H +#include "tile_evaluator_common.h" +// #include "tile_operator.h" + + +namespace ATVC::Tile { + +template +__aicore__ inline void Evaluate(ArgTup& args) +{ + auto localVar = AscendC::Std::tuple<>{}; + DemoEval::Evaluator{}(Expr{}, args, localVar); +} + +} // namespace ATVC::Tile +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/tile/tile_evaluator_common.h b/best_practices/atvos_demo/include/tile/tile_evaluator_common.h new file mode 100644 index 0000000000000000000000000000000000000000..5c431e38deb2b08dcdef699bebfd2e358c265664 --- /dev/null +++ b/best_practices/atvos_demo/include/tile/tile_evaluator_common.h @@ -0,0 +1,261 @@ +#ifndef TILE_EVAL_COMMON_H +#define TILE_EVAL_COMMON_H + +#include "../utils/expression/expression.h" +/* Evaluation */ + +namespace DemoEval { + +using Util::TMP::FindUnique_t; +using Util::TMP::Size_v; 
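+
+// Overview (illustrative note, not part of the original header): the Evaluator
+// specializations below walk an expression tree built with the templates from
+// utils/expression/expression.h. For instance, with z declared as an out
+// Param and x, y as in Params (exact template arguments elided here), writing
+//
+//     auto expr = (z = x + y);
+//
+// makes operator+ wrap x and y in an OpAdd node, operator= wrap that node in
+// an OpAssign node, and Evaluator<OpAssign<..., OpAdd<...>>> dispatch the whole
+// statement to AddAssign(), which performs AscendC::Add on the local tensors.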
+ +using namespace ATVC::ExprTmpl; + +// Primary template +template +struct Evaluator { + using Type = T; + + template + __aicore__ decltype(auto) operator()(const T& value, + ArgTup& args, + LocalVarTup& localVars) const + { + if constexpr (std::is_invocable_v) { + return value(args, localVars); + } else { + return value; + } + } +}; + +template +constexpr auto DefineAutoLocalVar(const Expression& /*expr*/) +{ + using ResultType = typename Evaluator::Type; + return Expression>{}; +} + +namespace Detail { + +template +constexpr auto GetParamTupleImpl(std::index_sequence) +{ + return std::make_tuple( + typename FindUnique_t::template Checker, + Params>::Type{}...); +} + +template +constexpr auto GetLocalVarTupleImpl(std::index_sequence) +{ + return std::make_tuple( + typename FindUnique_t::template Checker, + LocalVars>::Type{}...); +} + +template +void MakeAlikeDatum(std::vector& dst, const std::vector& src) +{ + dst.resize(src.size()); +} + +// template +// void MakeAlikeDatum(Tensor& dst, const Tensor& src) +// { +// dst.SetSize(src.GetSize()); +// } + +template +void MakeAlikeOneLocalVar(T& localVar, const ArgTup& args) +{ + static_assert(IsLocalVar_v, "A LocalVar is needed"); + if constexpr (!std::is_same_v) { + MakeAlikeDatum(localVar, AscendC::Std::get(args)); + } +} + +template +void MakeAlikeLocalVarsImpl(LocalVarTup& localVars, const ArgTup& args, + std::index_sequence) +{ + (MakeAlikeOneLocalVar>( + AscendC::Std::get(localVars), args), + ...); +} + +} // namespace Detail + +template +constexpr auto GetParamTuple() +{ + constexpr auto paramCount = Size_v; + return Detail::GetParamTupleImpl( + std::make_index_sequence{}); +}; + +template +constexpr auto GetLocalVarTuple() +{ + constexpr auto localVarCount = Size_v; + return Detail::GetLocalVarTupleImpl( + std::make_index_sequence{}); +}; + +template +void MakeAlikeLocalVars(LocalVarTup& localVars, const ArgTup& args) +{ + Detail::MakeAlikeLocalVarsImpl( + localVars, args, + std::make_index_sequence>{}); +} + +template +auto Evaluate(const Expression& expr, Args&&... 
args) +{ + // Declare local variables so that we can always use xTup& in + // Evaluator<>::operator() + auto argTuple = std::forward_as_tuple(std::forward(args)...); + using LocalVars = LocalVars_t; + [[maybe_unused]] Params_t dummy; // Force checking the params + auto localVars = GetLocalVarTuple(); + MakeAlikeLocalVars(localVars, argTuple); + return Evaluator{}(expr.data, argTuple, localVars); +} + +// Treat Evaluator> as Evaluator +template +struct Evaluator> : Evaluator {}; + +// Partial specialization for LocalVar +template +struct Evaluator> { + using Type = T&; + + template + __aicore__ decltype(auto) operator()(LocalVar /*unused*/, + ArgTup& /*args*/, + LocalVarTup& localVars) const + { + static_assert(N > 0, "LocalVar number starts from 1"); + return AscendC::Std::get(localVars); + } +}; + +// Partial specialization for Param +template +struct Evaluator> { + using Type = T; + + template + __aicore__ decltype(auto) operator()(Param /*unused*/, + ArgTup& args, + LocalVarTup& /*localVars*/) const + { + static_assert(N > 0, "Param number starts from 1"); + constexpr auto index = N - 1; + using NthType = + typename AscendC::Std::tuple_element>::type; + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + return AscendC::Std::get(args); + } else { + static_assert( + U == ParamUsage::in, + "Only in-parameters allow implicit type conversions"); + return static_cast(AscendC::Std::get(args)); + } + } +}; + +template +__aicore__ void Assign(T& dst, const U& src) +{ + dst = src; +} + +// Partial specializtion for E_y = E_x +template +struct Evaluator> { + using Type = void; + + template + __aicore__ void operator()(const OpAssign& op, + ArgTup& args, + LocalVarTup& localVars) const + { + return Assign(Evaluator{}(op.lhs, args, localVars), + Evaluator{}(op.rhs, args, localVars)); + } +}; + +// Partial specializtion for E_x, E_y +template +struct Evaluator> { + using Type = void; + + template + __aicore__ auto operator()(const OpAndThen& op, + ArgTup& args, + LocalVarTup& localVars) const + { + // operator, evaluates sequentially + return Evaluator{}(op.lhs, args, localVars), + Evaluator{}(op.rhs, args, localVars); + } +}; + +template +__aicore__ auto Add(const T& src1, const U& src2) +{ + return src1 + src2; +} + +template +__aicore__ inline void AddAssign(AscendC::LocalTensor& dst, + const AscendC::LocalTensor& src1, + const AscendC::LocalTensor& src2) +{ + AscendC::printf("AddAssign 1"); + AscendC::Add(dst, src1, src2, src2.GetSize()); + AscendC::Adds(dst, dst, (T)3, 32); + AscendC::DumpTensor(dst, 111, 64); +} + +// Partial specializtion for E_x + E_y +template +struct Evaluator> { + using Type = decltype(Add(std::declval::Type>(), + std::declval::Type>())); + + template + __aicore__ auto operator()(const OpAdd& op, + ArgTup& args, + LocalVarTup& localVars) const + { + return Add(Evaluator{}(op.lhs, args, localVars), + Evaluator{}(op.rhs, args, localVars)); + } +}; + +// Partial specializtion for E_z = E_x + E_y +template +struct Evaluator>> { + using Type = void; + + template + __aicore__ auto operator()(const OpAssign>& op, + ArgTup& args, + LocalVarTup& localVars) const + { + return AddAssign(Evaluator{}(op.lhs, args, localVars), + Evaluator{}(op.rhs.lhs, args, localVars), + Evaluator{}(op.rhs.rhs, args, localVars)); + } +}; + +} // namespace DemoEval + +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/tile/tile_operator.h b/best_practices/atvos_demo/include/tile/tile_operator.h new file mode 100644 index 
0000000000000000000000000000000000000000..49c0a3507c0b195b71746a2575982bb7062464ed --- /dev/null +++ b/best_practices/atvos_demo/include/tile/tile_operator.h @@ -0,0 +1,16 @@ +#ifndef __ATVC_TILE_OPERATOR_H__ +#define __ATVC_TILE_OPERATOR_H__ +#include "tile_evaluator_common.h" + +namespace ATVC::Tile { + template + __aicore__ inline void CopyIn(AscendC::LocalTensor dst, AscendC::GlobalTensor src, uint32_t copyCnt){ + AscendC::DataCopy(dst, src, copyCnt); + } + + template + __aicore__ inline void CopyIn(AscendC::GlobalTensor dst, AscendC::LocalTensor src, uint32_t copyCnt){ + AscendC::DataCopy(dst, src, copyCnt); + } +} +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/utils/buf_pool/ele_buf_pool.h b/best_practices/atvos_demo/include/utils/buf_pool/ele_buf_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..35a63dfd9f6cf2e60d36aa58e4733ee61a946e9a --- /dev/null +++ b/best_practices/atvos_demo/include/utils/buf_pool/ele_buf_pool.h @@ -0,0 +1,40 @@ +#ifndef __ATVC_UTILS_ELE_BUF_POOL_H__ +#define __ATVC_UTILS_ELE_BUF_POOL_H__ +namespace ATVC::UTILS { + +class EleBufPool { +public: + __aicore__ inline EleBufPool(){} + + __aicore__ inline void InitBufPool(uint32_t inBufNum, uint64_t inBytes, uint32_t outBufNum, uint64_t outBytes){ + GetTPipePtr()->InitBuffer(inQueue, inBufNum, inBytes); + GetTPipePtr()->InitBuffer(outQueue, outBufNum, outBytes); + } + + // __aicore__ inline AscendC::LocalTensor& GetInTnesor(){ + // AscendC::LocalTensor inTensor = inQueue.template AllocTensor(); + // return inTensor; + // } + // __aicore__ inline void GetOutTnesor(AscendC::LocalTensor &outTensor){ + // outTensor = outQueue.template AllocTensor(); + // } + + __aicore__ inline void EnqueInTnesor(AscendC::LocalTensor* inTensor){ + inQueue.EnQue(*inTensor); + } + __aicore__ inline void EnqueOutTnesor(AscendC::LocalTensor* outTensor){ + outQueue.EnQue(*outTensor); + } + __aicore__ inline void DequeInTnesor(){ + inQueue.template DeQue(); + } + __aicore__ inline void DequeOutTnesor(AscendC::LocalTensor* outTensor){ + outQueue.template DeQue(); + } + + AscendC::TQue inQueue; + AscendC::TQue outQueue; + AscendC::TBuf tempQueue; +}; +} +#endif diff --git a/best_practices/atvos_demo/include/utils/expression/expression.h b/best_practices/atvos_demo/include/utils/expression/expression.h new file mode 100644 index 0000000000000000000000000000000000000000..5e6cac33a498a45faf9a08309bdd24a17b872c9f --- /dev/null +++ b/best_practices/atvos_demo/include/utils/expression/expression.h @@ -0,0 +1,308 @@ +#ifndef EXPRESSION_H +#define EXPRESSION_H +#include +#include +#include +#include +#include +#include +#include +#include "utility.h" + +/* Expression templates */ + +// NOLINTBEGIN(cppcoreguidelines-avoid-const-or-ref-data-members) +// NOLINTBEGIN(cppcoreguidelines-c-copy-assignment-signature) +// NOLINTBEGIN(misc-unconventional-assign-operator) + +namespace ATVC::ExprTmpl { + +using Util::TMP::TypeList; + +template +struct HasDataTrait { + static constexpr bool value = true; +}; + +template +struct HasDataTrait> { + static constexpr bool value = T::hasData; +}; + +// Type for a basic "expression". If T is trivial (as it should normally +// be), Expression is trivially copy-constructible and trivially move- +// constructible, but is not trivially assignable. Objects of the same +// Expression type cannot be assigned, because the data member is const. 
+// Objects of different Expression types can be assigned, but it will +// result in a new Expression object (and is not a "normal" assignment). +template +struct Expression { + static_assert(!std::is_rvalue_reference_v, + "Rvalue references cannot be stored"); + using Type = T; + static constexpr bool hasData = HasDataTrait::value; + + T const data{}; + + template + [[nodiscard]] constexpr auto operator=(Expression u); +}; + +// Let T deduce to value type or lvalue reference type +template +Expression(T&& value) -> Expression; + +template +using IsExpression = Util::IsSpecializationOf; + +template +inline constexpr bool IsExpression_v = IsExpression::value; + +template +struct LocalVar { + static_assert(!std::is_reference_v, + "A LocalVar must not be a reference"); + using Type = T; + using Like = L; + static constexpr std::size_t number = N; + static constexpr bool hasData = false; + + template + constexpr auto operator=(Expression) + { + static_assert(Util::AlwaysFalse_v, + "Please use Expression for assignment"); + } +}; + +template +struct IsLocalVar : std::false_type {}; + +template +struct IsLocalVar> : std::true_type {}; + +template +inline constexpr bool IsLocalVar_v = IsLocalVar::value; + +enum class ParamUsage { + in, + out, + in_out, + //temporary, +}; + +template +struct Param { + using Type = T; + static constexpr std::size_t number = N; + static constexpr ParamUsage usage = U; + static constexpr bool hasData = false; + + template + constexpr auto operator=(Expression) + { + static_assert(Util::AlwaysFalse_v, + "Please use Expression for assignment"); + } +}; + +template +struct IsParam : std::false_type {}; + +template +struct IsParam> : std::true_type {}; + +template +inline constexpr bool IsParam_v = IsParam::value; + +template +constexpr auto DefineLocalVar() +{ + return Expression>{}; +} + +template +constexpr auto DefineLocalVarLike(Expression /*unused*/) +{ + static_assert(IsParam_v, "A LocalVar can only be like a Param"); + return Expression>{}; +} + +template +__host_aicore__ constexpr auto DefineParam() +{ + return Expression>{}; +} + +template +struct CheckVarNum { + template + struct Checker { + static constexpr bool value = (T::number == N); + }; +}; + +template +struct BinaryOp { + static_assert(!(std::is_rvalue_reference_v || + std::is_rvalue_reference_v), + "Rvalue references cannot be stored"); + static constexpr bool hasData = + HasDataTrait::value || HasDataTrait::value; + using IsBinaryOp = void; + using LhsType = T; + using RhsType = U; + T lhs; + U rhs; +}; + +namespace Detail { + +template +struct LocalVarCollector { + using Type = TypeList<>; +}; + +template +struct LocalVarCollector>> { + using Type = TypeList; +}; + +template +struct LocalVarCollector> { + using Type = Util::TMP::Concatenate_t< + typename LocalVarCollector::Type, + typename LocalVarCollector::Type>; +}; + +template +struct ParamCollector { + using Type = TypeList<>; +}; + +template +struct ParamCollector>> { + using Type = TypeList; +}; + +template +struct ParamCollector> { + using Type = Util::TMP::Concatenate_t< + typename ParamCollector::Type, + typename ParamCollector::Type>; +}; + +} // namespace Detail + +template +struct LocalVars { + using Type = + Util::TMP::Unique_t::Type>; + +private: + static constexpr std::size_t size = Util::TMP::Size_v; + + template + struct InRange + : std::bool_constant<(U::number > 0 && U::number <= size)> {}; + static_assert(Util::TMP::Check_v, + "LocalVars must be numbered sequentially from 1"); +}; + +template +using LocalVars_t = typename 
LocalVars::Type; + +template +struct Params { + using Type = + Util::TMP::Unique_t::Type>; + +private: + static constexpr std::size_t size = Util::TMP::Size_v; + + template + struct InRange + : std::bool_constant<(U::number > 0 && U::number <= size)> {}; + static_assert(Util::TMP::Check_v, + "Params must be numbered sequentially from 1"); +}; + +template +using Params_t = typename Params::Type; + +template +struct InParams { + template + struct IsInVar : std::bool_constant {}; + using Type = Util::TMP::Filter_t>; +}; + +template +using InParams_t = typename InParams::Type; + +template +struct OutParams { + template + struct IsOutVar : std::bool_constant {}; + using Type = Util::TMP::Filter_t>; +}; + +template +using OutParams_t = typename OutParams::Type; + +template +struct OpAssign : BinaryOp {}; + +template +template +__host_aicore__ constexpr auto Expression::operator=(Expression u) +{ + static_assert( + (IsParam_v || IsLocalVar_v || std::is_lvalue_reference_v), + "Only a Param, LocalVar, or reference can appear on the left side " + "of assignment"); + return Expression>{{data, u.data}}; +} + +template +struct OpAndThen : BinaryOp {}; + +template +__host_aicore__ constexpr auto operator,(Expression t, Expression u) +{ + return Expression>{{t.data, u.data}}; +} + +// Disallow dangerous expressions like (Expression{2}, 3) +template +__host_aicore__ constexpr auto operator,(Expressiont, U&& u) = delete; + +template +struct OpAdd : BinaryOp {}; + +template +__host_aicore__ constexpr auto operator+(Expression lhs, Expression rhs) +{ + return Expression>{{lhs.data, rhs.data}}; +} + +template +__host_aicore__ constexpr auto operator+(Expression lhs, U&& rhs) +{ + return Expression>{{lhs.data, std::forward(rhs)}}; +} + +template +__host_aicore__ constexpr auto operator+(T&& lhs, Expression rhs) +{ + return Expression>{{std::forward(lhs), rhs.data}}; +} + +// Base class to express that a class is a maker of calculation expression template +class Maker {}; + +} // namespace ATVC::ExprTmpl + +#endif \ No newline at end of file diff --git a/best_practices/atvos_demo/include/utils/expression/utility.h b/best_practices/atvos_demo/include/utils/expression/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..7e56558ca1ee13c5e658d0b88bc8cd1a92549b81 --- /dev/null +++ b/best_practices/atvos_demo/include/utils/expression/utility.h @@ -0,0 +1,206 @@ +#ifndef UTILITY_H +#define UTILITY_H +/* Utilities */ + +namespace Util { + +template +struct AlwaysFalse : std::false_type {}; + +template +inline constexpr bool AlwaysFalse_v = AlwaysFalse::value; + +template