InfiniTensor
diff --git a/‎.github/workflows/main.yaml‎
Lines changed: 91 additions & 0 deletions b/‎.github/workflows/main.yaml‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 10 additions & 0 deletions b/‎.gitignore‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 72 additions & 19 deletions b/‎README.md‎
Lines changed: 72 additions & 19 deletions
diff --git a/‎include/data_type.h‎
Lines changed: 20 additions & 0 deletions b/‎include/data_type.h‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎include/device.h‎
Lines changed: 8 additions & 3 deletions b/‎include/device.h‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎include/handle.h‎
Lines changed: 12 additions & 0 deletions b/‎include/handle.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎include/handle/handle_export.h‎
Lines changed: 12 additions & 0 deletions b/‎include/handle/handle_export.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎include/infini_operators.h‎
Lines changed: 13 additions & 1 deletion b/‎include/infini_operators.h‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎include/operators.h‎
Lines changed: 2 additions & 4 deletions b/‎include/operators.h‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎include/ops/add/add.h‎
Lines changed: 27 additions & 0 deletions b/‎include/ops/add/add.h‎
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,91 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+      - dev
+  pull_request:
+
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.x'
+
+    - name: Install Python dependencies
+      run: |
+        pip install numpy
+        pip install torch
+
+    - name: Install xmake
+      uses: xmake-io/github-action-setup-xmake@v1
+      with:
+        xmake-version: latest
+    
+    - name: configure xmake
+      run: xmake f --cpu=true -cv
+
+    - name: Set INFINI_ROOT
+      run: |
+        export INFINI_ROOT=$GITHUB_WORKSPACE/.infini
+        mkdir -p $INFINI_ROOT
+        echo "INFINI_ROOT=$INFINI_ROOT" >> $GITHUB_ENV
+
+    - name: Build with XMake
+      run: xmake build && xmake install
+
+    - name: Run Python Tests
+      run: |
+        # Run every script in operatorspy/tests on the CPU backend, report the
+        # time each one takes, and fail the job if at least one script fails.
+        GREEN='\033[0;32m'
+        RED='\033[0;31m'
+        NC='\033[0m' # No Color
+
+        PASSED_TESTS=()
+        FAILED_TESTS=()
+        for script in operatorspy/tests/*.py; do
+          script_name="$(basename "$script")"
+          # __init__.py and test_utils.py are excluded from the run.
+          if [ "$script_name" != "__init__.py" ] && [ "$script_name" != "test_utils.py" ]; then
+            echo "Running $script"
+            START_TIME=$(date +%s)
+            if ! python3 "$script" --cpu; then
+              echo "$script failed"
+              FAILED_TESTS+=("$script")
+            else
+              echo "$script passed"
+              PASSED_TESTS+=("$script")
+            fi
+            END_TIME=$(date +%s)
+            DURATION=$(( END_TIME - START_TIME ))
+            # Not named SECONDS: bash treats SECONDS as a live timer variable.
+            ELAPSED_MIN=$(( DURATION / 60 ))
+            ELAPSED_SEC=$(( DURATION % 60 ))
+            echo "Execution time for $script: ${ELAPSED_MIN}m ${ELAPSED_SEC}s"
+          fi
+        done
+
+        echo "The following tests passed:"
+        for test in "${PASSED_TESTS[@]}"; do
+          echo -e "${GREEN}$test${NC}"
+        done
+
+        if [ ${#FAILED_TESTS[@]} -ne 0 ]; then
+          echo "The following tests failed:"
+          for test in "${FAILED_TESTS[@]}"; do
+            echo -e "${RED}$test${NC}"
+          done
+          exit 1
+        fi
+
+        # -e is required so the colour escapes are interpreted, not printed.
+        echo -e "${GREEN}All tests passed${NC}"
+      env:
+        INFINI_ROOT: ${{ env.INFINI_ROOT }}
@@ -13,3 +13,13 @@ __pycache__/
 
 # Lib
 lib/
+out/
+
+# Log
+*.log
+
+# Cache
+cache/
+
+# Json
+*.json
@@ -1,30 +1,77 @@
-# 算子库
+# InfiniOperators 算子库
 
-跨平台高性能通用算子库。形式为 C 接口动态库。
+跨平台高性能统一算子库。形式为 C 接口动态库。
 
-采用二段式算子设计，每个算子都实现并对外暴露以下的 C 接口:
+## 简介
 
-- 第一阶段：构造算子 Descriptor。用户提供的算子名称、硬件、以及算子配置（如计算的数据类型、计算排布等），相应模组会被 load 到硬件上。
+### 算子接口设计
+
+采用3+1段式算子设计，每个算子都实现并对外暴露以下的 C 接口:
+
+- 第一阶段：构造硬件控柄（Handle）。用户提供控柄地址、硬件类型以及硬件序号。控柄所在的内存空间由用户管理。
 
   ```C
-  void* createOpDescriptor(Device, void *config);
+  infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id);
   ```
 
-- 第二阶段：计算。根据一阶段的 Descriptor，执行相应计算，用户需要提供输入输出张量，以及硬件计算流（CPU 为 NULL）。
+- 第二阶段：构造算子描述（Descriptor）。用户提供描述符地址、硬件控柄、以及算子涉及的张量描述（含张量数据类型、形状和步长）。这一步会完成算子所需的与张量数据无关的预计算。
 
   ```C
-  void op(void *descriptor, Tensor output, Tensor input, void *stream);
+  infiniopStatus_t infiniopCreateOpDescriptor(infiniopHandle_t handle, infiniopOpDescriptor_t *desc_ptr, infiniopTensorDescriptor_t t, ...);
   ```
 
-- 销毁 Descriptor。
+- 第三阶段（可选）：计算额外工作空间。根据算子描述，计算算子所需的额外工作空间大小，并存储于用户提供的位置。具体空间分配由用户负责。
 
   ```C
-  void destroyOpDescriptor(void *descriptor);
+  infiniopStatus_t infiniopGetOpWorkspaceSize(infiniopOpDescriptor_t desc, uint64_t *size);
   ```
 
+- 第四阶段：计算。根据算子描述符，在指定的硬件上执行相应计算，用户需要提供输入输出的数据，以及硬件计算流（CPU 为 NULL）。
+
+  ```C
+  infiniopStatus_t infiniopOp(infiniopOpDescriptor_t desc, [void *workspace, uint64_t workspace_size,] void *output_data, void *input_data, ..., void *stream);
+  ```
+
+- 销毁描述和硬件控柄。
+
+  ```C
+  infiniopStatus_t infiniopDestroyOpDescriptor(infiniopOpDescriptor_t desc);
+  infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
+  ```
+
+### 张量（Tensor）描述设计
+
+张量描述由以下几个部分组成：
+
+1.数据类型，由打包大小（即一个元素代表几个数据）、符号位、元素大小、尾数位数、指数位数共4字节表示。定义如下：
+
+```C
+typedef struct DataLayout {
+    unsigned short
+        packed : 8,
+        sign : 1,
+        size : 7,
+        mantissa : 8,
+        exponent : 8;
+} DataLayout;
+```
+
+2.维度信息。张量有多少个维度。类型为uint64_t。
+
+3.张量形状。张量每个维度的大小。类型为uint64_t*。
+
+4.张量步长。张量每个维度的步长。类型为uint64_t*。
+
+创建和销毁张量描述符的接口：
+
+```C
+infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, DataLayout layout, uint64_t ndim, uint64_t *shape, uint64_t *strides);
+infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
+```
+
 ## 一、使用说明
 
-### 配置
+### 1. 配置
 
 #### 查看当前配置
 
@@ -52,23 +99,27 @@ xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
 xmake f --cambricon-mlu=true -cv
 ```
 
-### 编译
+#### 配置 NPU
+
+````xmake
+xmake f --ascend-npu=true -cv
+````
+
+### 2. 编译安装
 
 ```xmake
-xmake
+xmake build && xmake install
 ```
 
-### 将编译好的算子库添加至环境变量 `INFINI_ROOT`
+### 3. 设置环境变量
 
-```bash
-export INFINI_ROOT=[PATH_TO_LIBRARY]
-```
+按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。
 
-### 运行算子测试
+### 4. 运行算子测试
 
 ```bash
 cd operatorspy/tests
-python operator_name.py
+python operator_name.py [--cpu | --cuda | --cambricon | --ascend]
 ```
 
 ## 二、开发说明
@@ -82,6 +133,8 @@ python operator_name.py
 │   │   ├── [operator_name].h  # 对外暴露的算子 C 接口定义，descriptor 定义
 │   ├── tensor
 │   │   ├── tensor_descriptor.h  # 对外暴露的张量 descriptor 定义
+│   ├── handle
+│   │   ├── handle_export.h  # 对外暴露的硬件 handle 定义
 │   ├── *.h  # 对外暴露的核心结构体定义
 ├── src
 │   ├── devices
@@ -105,7 +158,7 @@ python operator_name.py
 
 - 在 `src/device.h` 和 `operatorspy/devices.py` 中增加新的硬件类型，注意两者需要一一对应；
 - 在 `xmake.lua` 中增加新硬件的编译选项以及编译方式；
-- 在 `src/ops/devices/[device_name]` 下编写特定硬件的通用代码；
+- 在 `src/ops/devices/[device_name]` 下编写特定硬件的handle实现和通用代码；
 - 实现该硬件的算子；
 
 ### 增加新的算子
 
@@ -8,8 +8,28 @@ typedef struct DataLayout {
         size : 7,
         mantissa : 8,
         exponent : 8;
+
+#ifdef __cplusplus
+    // Equality of two layouts, decided field by field.
+    // The bit-fields are compared directly rather than through a union:
+    // reading a union member other than the one last written is undefined
+    // behaviour in C++, and the old code also relied on `unsigned int` being
+    // exactly as wide as the whole struct.
+    bool operator==(const DataLayout &other) const {
+        return packed == other.packed
+            && sign == other.sign
+            && size == other.size
+            && mantissa == other.mantissa
+            && exponent == other.exponent;
+    }
+
+    // Inequality is the logical negation of operator==.
+    bool operator!=(const DataLayout &other) const {
+        return !operator==(other);
+    }
+#endif
 } DataLayout;
 
+typedef struct DataLayout DT;
+
 // clang-format off
 const static struct DataLayout
     I8   = {1, 1, 1,  7,  0},
 
@@ -2,9 +2,14 @@
 #define __DEVICE_H__
 
 enum DeviceEnum { // supported hardware back-ends; the README asks that these stay in one-to-one correspondence with operatorspy/devices.py
-    DevCpu,
-    DevNvGpu,
-    DevCambriconMlu,
+    DevCpu = 0,          // values are written out explicitly so the numbering is fixed
+    DevNvGpu = 1,
+    DevCambriconMlu = 2,
+    DevAscendNpu = 3,
+    DevMetaxGpu = 4,
+    DevMthreadsGpu = 5,
 };
 
+typedef enum DeviceEnum Device;
+
 #endif// __DEVICE_H__
@@ -0,0 +1,12 @@
+#ifndef INFINIOP_HANDLE_H
+#define INFINIOP_HANDLE_H
+
+#include "device.h"
+
+typedef struct HandleStruct { // hardware handle; infiniopHandle_t is a pointer to this struct
+    Device device;            // device type (a DeviceEnum value) this handle refers to
+} HandleStruct;
+
+typedef HandleStruct *infiniopHandle_t;
+
+#endif
@@ -0,0 +1,12 @@
+#ifndef INFINIOP_HANDLE_EXPORT_H
+#define INFINIOP_HANDLE_EXPORT_H
+#include "../status.h"
+#include "../handle.h"
+#include "../export.h"
+#include "../device.h"
+
+__C __export infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, Device device, int device_id); // per the README: caller supplies the handle address, the hardware type and the hardware index
+
+__C __export infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle); // destroys a hardware handle (presumably one from infiniopCreateHandle — confirm)
+
+#endif // INFINIOP_HANDLE_EXPORT_H
@@ -1,6 +1,18 @@
+#include "handle/handle_export.h"
+#include "ops/add/add.h"
+#include "ops/attention/attention.h"
+#include "ops/avg_pool/avg_pool.h"
 #include "ops/causal_softmax/causal_softmax.h"
+#include "ops/global_avg_pool/global_avg_pool.h"
+#include "ops/expand/expand.h"
+#include "ops/gemm/gemm.h"
+#include "ops/conv/conv.h"
 #include "ops/matmul/matmul.h"
-#include "ops/reform/reform.h"
+#include "ops/max_pool/max_pool.h"
+#include "ops/mlp/mlp.h"
+#include "ops/random_sample/random_sample.h"
+#include "ops/rearrange/rearrange.h"
+#include "ops/relu/relu.h"
 #include "ops/rms_norm/rms_norm.h"
 #include "ops/rotary_embedding/rotary_embedding.h"
 #include "ops/swiglu/swiglu.h"
 
@@ -1,11 +1,9 @@
 #ifndef __OPERATORS_H__
 #define __OPERATORS_H__
 
-#include "data_type.h"
 #include "device.h"
 #include "tensor.h"
-
-typedef enum DeviceEnum Device;
-typedef struct DataLayout DT;
+#include "handle.h"
+#include "status.h"
 
 #endif// __OPERATORS_H__
@@ -0,0 +1,27 @@
+#ifndef ADD_H
+#define ADD_H
+
+#include "../../export.h"
+#include "../../operators.h"
+
+typedef struct AddDescriptor { // publicly visible part of the Add operator descriptor
+    Device device;             // device type (a DeviceEnum value) stored in the descriptor
+} AddDescriptor;
+
+typedef AddDescriptor *infiniopAddDescriptor_t;
+
+__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,            // hardware handle
+                                                          infiniopAddDescriptor_t *desc_ptr,  // address supplied by the caller for the new descriptor
+                                                          infiniopTensorDescriptor_t c,       // tensor descriptor for `c` (presumably the output — TODO confirm)
+                                                          infiniopTensorDescriptor_t a,       // tensor descriptor for operand `a`
+                                                          infiniopTensorDescriptor_t b);      // tensor descriptor for operand `b`
+
+__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, // Add operator descriptor
+                                          void *c,                       // writable data pointer (presumably receives the result — TODO confirm)
+                                          void const *a,                 // read-only operand data
+                                          void const *b,                 // read-only operand data
+                                          void *stream);                 // hardware compute stream; the README says NULL for CPU
+
+__C __export infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); // destroys an Add descriptor (presumably one from infiniopCreateAddDescriptor — confirm)
+
+#endif
-Original file line number
+Diff line change
 # Lib
 lib/
 +out/
++
 +# Log
 +*.log
++
 +# Cache
 +cache/
++
 +# Json
 +*.json