diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 05df2ca..d32c22c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,25 +1,25 @@
-name: Build and test cpu
-on:
-  push:
-    paths-ignore:
-      - '**.md'
-      - 'LICENSE'
-  pull_request:
-    paths:
-      - '**.md'
-      - 'LICENSE'
-
-jobs:
-  build:
-    name: Build
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        submodules: recursive
-
-    - name: Build
-      run: make
-
-    - name: Test cpu
-      run: make test-cpp
+name: Build and test cpu
+on:
+  push:
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
+  pull_request:
+    paths-ignore:  # skip CI for documentation-only changes, mirroring the push filter
+      - '**.md'
+      - 'LICENSE'
+
+jobs:
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        submodules: recursive  # the CMake build adds 3rd-party/googletest from the checkout when tests are enabled
+
+    - name: Build
+      run: make  # runs the Makefile's default goal
+
+    - name: Test cpu
+      run: make test-cpp  # runs `make test` inside the build tree produced by the previous step
diff --git a/.gitignore b/.gitignore
index 98e980a..15ad474 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,46 +1,46 @@
-# Prerequisites
-*.d
-
-# Compiled Object files
-*.slo
-*.lo
-*.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-*.smod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
-
-build/
-build_debug/
-
-.vscode/
-
-# python
-*.egg-info
-*.pyc
-
-# onnx model
-*.onnx
-*.pb
-*.npy
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+build/
+build_debug/
+
+.vscode/
+
+# python
+*.egg-info
+*.pyc
+
+# onnx model
+*.onnx
+*.pb
+*.npy
diff --git a/.gitmodules b/.gitmodules
index e856b94..f7abc37 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
-[submodule "3rd-party/nlohmann_json_cmake_fetchcontent"]
-	path = 3rd-party/nlohmann_json_cmake_fetchcontent
-	url = git@github.com:ArthurSonzogni/nlohmann_json_cmake_fetchcontent.git
-[submodule "3rd-party/googletest"]
-	path = 3rd-party/googletest
-	url = git@github.com:google/googletest.git
+[submodule "3rd-party/nlohmann_json_cmake_fetchcontent"]
+	path = 3rd-party/nlohmann_json_cmake_fetchcontent
+	url = https://github.com/ArthurSonzogni/nlohmann_json_cmake_fetchcontent.git
+[submodule "3rd-party/googletest"]
+	path = 3rd-party/googletest
+	url = https://github.com/google/googletest.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 836a7e0..62fcf74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,96 +1,96 @@
-# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
-option(BUILD_TEST "Build tests" OFF)
-
-cmake_minimum_required(VERSION 3.17)
-
-include(CMakeDependentOption)
-project(InfiniTensor C CXX)
-
-cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
-
-set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
-# Build Type
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    message("Configuring for Debug build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
-    add_compile_definitions(DEBUG_MODE)
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-    message("Configuring for Release build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
-    add_compile_definitions(NDEBUG)
-elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-    message("Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
-else()
-    message("Build type not specified. Configuring for RelWithDebInfo build.")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
-endif()
-
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
-add_compile_options(-Wno-error=unused-variable)
-
-find_package(
-  Python
-  COMPONENTS Interpreter Development
-  REQUIRED)
-
-# OpenMP
-find_package(OpenMP)
-if(OpenMP_C_FOUND)
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-endif()
-if(OpenMP_CXX_FOUND)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-endif()
-
-include_directories(include)
-
-if(BUILD_TEST)
-  set(BUILD_GMOCK
-      OFF
-      CACHE BOOL "Do not build gmock" FORCE)
-  set(INSTALL_GTEST
-      OFF
-      CACHE BOOL "Do not install gtest" FORCE)
-  add_subdirectory(3rd-party/googletest)
-  include_directories(3rd-party/googletest/googletest/include)
-endif()
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
-set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
-
-
-# Source files
-file(GLOB_RECURSE SRC src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
-
-if(USE_INTELCPU)
-  file(GLOB_RECURSE SRC_INTELCPU src/intelcpu/*.cc src/kernels/intelcpu/*.cc )
-  list (APPEND SRC ${SRC_INTELCPU})
-endif()
-
-# Libraries
-add_library(InfiniTensor SHARED ${SRC})
-
-function(build_test files)
-  # Non-recursive glob for skip failed tests
-  file(GLOB TEST_SOURCES ${files})
-  foreach(testsourcefile ${TEST_SOURCES})
-    get_filename_component(testname ${testsourcefile} NAME_WE)
-    add_executable(${testname} ${testsourcefile})
-    target_link_libraries(${testname} InfiniTensor GTest::gtest_main)
-    add_test(NAME ${testname} COMMAND ${testname})
-  endforeach(testsourcefile ${TEST_SOURCES})
-endfunction()
-
-if(BUILD_TEST)
-  add_compile_definitions(BUILD_TEST=1)
-  enable_testing()
-  if(BUILD_TEST_CORE)
-    build_test(test/core/*.cc)
-    build_test(test/operators/*.cc)
-    build_test(test/kernels/nativecpu/*.cc)
-  endif()
-endif()
+cmake_minimum_required(VERSION 3.17)
+
+include(CMakeDependentOption)
+project(InfiniTensor C CXX)
+
+# Do not change these options in this file. Use cmake.config, cmake -DOPTION=VALUE, or ccmake to specify them.
+option(BUILD_TEST "Build tests" OFF)
+# BUILD_TEST_CORE defaults to ON while BUILD_TEST is enabled; otherwise it is forced to OFF.
+cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
+
+set(DEFAULT_BUILD_TYPE "RelWithDebInfo") # NOTE(review): not referenced anywhere in the visible CMake code
+# Build type: append optimisation/debug flags to CMAKE_CXX_FLAGS according to CMAKE_BUILD_TYPE.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    message("Configuring for Debug build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
+    add_compile_definitions(DEBUG_MODE)
+elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    message("Configuring for Release build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
+    add_compile_definitions(NDEBUG) # NOTE(review): CMAKE_CXX_FLAGS_RELEASE later gets -UNDEBUG to keep assertions enabled
+elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    message("Configuring for RelWithDebInfo build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
+else() # reached for an empty build type and for any value not matched above
+    message("Build type not specified. Configuring for RelWithDebInfo build.")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O2")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON) # fail at configure time instead of silently falling back to an older standard
+set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++17 when on, -std=c++17 when off
+add_compile_options(-Wno-error=unused-variable)
+
+find_package(
+  Python
+  COMPONENTS Interpreter Development
+  REQUIRED) # configuration aborts when the interpreter or the development files are missing
+
+# OpenMP (optional): its flags are appended only for the languages it was found for
+find_package(OpenMP)
+if(OpenMP_C_FOUND)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+endif()
+if(OpenMP_CXX_FOUND)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+include_directories(include) # directory-scoped header search path
+
+if(BUILD_TEST) # googletest is only configured when tests are requested
+  set(BUILD_GMOCK
+      OFF
+      CACHE BOOL "Do not build gmock" FORCE) # forced into the cache before the subdirectory is added
+  set(INSTALL_GTEST
+      OFF
+      CACHE BOOL "Do not install gtest" FORCE) # forced into the cache before the subdirectory is added
+  add_subdirectory(3rd-party/googletest)
+  include_directories(3rd-party/googletest/googletest/include)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations -Wno-error=pointer-arith") # warnings are errors, except the -Wno-error= categories
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") # Enable assertion
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -UNDEBUG") # Enable assertion
+
+
+# Source files (CONFIGURE_DEPENDS re-checks the globs at build time so added/removed .cc files are noticed)
+file(GLOB_RECURSE SRC CONFIGURE_DEPENDS src/core/*.cc src/kernels/cpu/*.cc src/operators/*.cc src/utils/*.cc)
+
+if(USE_INTELCPU)
+  file(GLOB_RECURSE SRC_INTELCPU CONFIGURE_DEPENDS src/intelcpu/*.cc src/kernels/intelcpu/*.cc)
+  list(APPEND SRC ${SRC_INTELCPU})
+endif()
+
+# Libraries
+add_library(InfiniTensor SHARED ${SRC})
+
+function(build_test files)
+  # Build one gtest executable per source matched by `files` and register it with CTest; the glob is non-recursive, so subdirectories are skipped.
+  file(GLOB TEST_SOURCES CONFIGURE_DEPENDS ${files})
+  foreach(testsourcefile ${TEST_SOURCES})
+    get_filename_component(testname ${testsourcefile} NAME_WE) # target and test name: file name without directory or extension
+    add_executable(${testname} ${testsourcefile})
+    target_link_libraries(${testname} PRIVATE InfiniTensor GTest::gtest_main)
+    add_test(NAME ${testname} COMMAND ${testname})
+  endforeach()
+endfunction()
+
+if(BUILD_TEST)
+  add_compile_definitions(BUILD_TEST=1) # exposes a BUILD_TEST preprocessor macro to compiled sources
+  enable_testing()
+  if(BUILD_TEST_CORE) # dependent option: only ON while BUILD_TEST is ON
+    build_test(test/core/*.cc)
+    build_test(test/operators/*.cc)
+    build_test(test/kernels/nativecpu/*.cc)
+  endif()
+endif()
diff --git a/LICENSE b/LICENSE
index 261eeb9..29f81d8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,201 +1,201 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/Makefile b/Makefile
index 35ef7ef..1028192 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,18 @@
-﻿.PHONY : build clean format install-python test-cpp test-onnx
-
-TYPE ?= Release
-TEST ?= ON
-
-CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
-CMAKE_OPT += -DBUILD_TEST=$(TEST)
-
-build:
-	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
-
-clean:
-	rm -rf build
-
-test-cpp:
-	@echo
-	cd build/$(TYPE) && make test
+﻿.PHONY : build clean format install-python test-cpp test-onnx
+
+TYPE ?= Release
+TEST ?= ON
+
+CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)
+CMAKE_OPT += -DBUILD_TEST=$(TEST)
+
+build:
+	mkdir -p build/$(TYPE)
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+
+clean:
+	rm -rf build
+
+test-cpp:
+	@echo
+	cd build/$(TYPE) && make test
diff --git "a/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md" "b/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md"
index 5690349..12ac118 100644
--- "a/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md"
+++ "b/docs/\351\241\271\347\233\256\351\203\250\347\275\262.md"
@@ -1,35 +1,35 @@
-### 环境准备
-建议使用Linux系统或Mac系统，windows下使用WSL，配置方法和Linux一致。
-
-1. 安装gcc、g++，请确认版本为 11.3 及以上的稳定版本
-``` bash
-# linux 使用apt安装
-sudo apt install gcc g++
-
-# mac 使用Homebrew安装
-brew install gcc
-```
-
-2. 安装CMake，请确认版本为 3.17 及以上的稳定版本
-``` bash
-# linux 使用apt安装
-sudo apt install cmake
-
-# mac 使用Homebrew安装
-brew install cmake
-```
-
-2. 安装make
-``` bash
-# linux 使用apt安装
-sudo apt install make
-
-# mac 使用Homebrew安装
-brew install make
-```
-
-### 构建命令
-配置好上述环境后，进入项目目录后可以通过以下命令进行构建。
-- `make`/`make build`: 构建整个项目;
-- `make test-cpp`: 构建项目后执行测例;
+### 环境准备
+建议使用Linux系统或Mac系统，windows下使用WSL，配置方法和Linux一致。
+
+1. 安装gcc、g++，请确认版本为 11.3 及以上的稳定版本
+``` bash
+# linux 使用apt安装
+sudo apt install gcc g++
+
+# mac 使用Homebrew安装
+brew install gcc
+```
+
+2. 安装CMake，请确认版本为 3.17 及以上的稳定版本
+``` bash
+# linux 使用apt安装
+sudo apt install cmake
+
+# mac 使用Homebrew安装
+brew install cmake
+```
+
+3. 安装make
+``` bash
+# linux 使用apt安装
+sudo apt install make
+
+# mac 使用Homebrew安装
+brew install make
+```
+
+### 构建命令
+配置好上述环境后，进入项目目录后可以通过以下命令进行构建。
+- `make`/`make build`: 构建整个项目;
+- `make test-cpp`: 构建项目后执行测例;
 - `make clean`：清理生成文件
\ No newline at end of file
diff --git a/include/core/allocator.h b/include/core/allocator.h
index 002601d..a1ca6d4 100644
--- a/include/core/allocator.h
+++ b/include/core/allocator.h
@@ -1,59 +1,61 @@
-#pragma once
-#include "core/runtime.h"
-#include "core/tensor.h"
-#ifdef BUILD_TEST
-#include "gtest/gtest.h"
-#endif
-#include <cstddef>
-#include <map>
-#include <unordered_set>
-
-namespace infini {
-  class Allocator
-  {
-  private:
-    Runtime runtime;
-
-    size_t used;
-
-    size_t peak;
-
-    size_t alignment;
-
-    // pointer to the memory actually allocated
-    void *ptr;
-
-    // =================================== 作业 ===================================
-    // TODO：可能需要设计一个数据结构来存储free block，以便于管理和合并
-    // HINT: 可以使用一个 map 来存储 free block，key 为 block 的起始/结尾地址，value 为 block 的大小
-    // =================================== 作业 ===================================
-
-  public:
-    Allocator(Runtime runtime);
-
-    virtual ~Allocator();
-
-    // function: simulate memory allocation
-    // arguments：
-    //     size: size of memory block to be allocated
-    // return: head address offset of the allocated memory block
-    size_t alloc(size_t size);
-
-    // function: simulate memory free
-    // arguments:
-    //     addr: head address offset of memory block to be free
-    //     size: size of memory block to be freed
-    void free(size_t addr, size_t size);
-
-    // function: perform actual memory allocation
-    // return: pointer to the head address of the allocated memory
-    void *getPtr();
-
-    void info();
-
-  private:
-    // function: memory alignment, rouned up
-    // return: size of the aligned memory block
-    size_t getAlignedSize(size_t size);
-  };
-}
+#pragma once
+#include "core/runtime.h"
+#include "core/tensor.h"
+#ifdef BUILD_TEST
+#include "gtest/gtest.h"
+#endif
+#include <cstddef>
+#include <map>
+#include <unordered_set>
+
+namespace infini {
+  class Allocator
+  {
+  private:
+    Runtime runtime;
+
+    size_t used;
+
+    size_t peak;
+
+    size_t alignment;
+
+    // pointer to the memory actually allocated
+    void *ptr;
+
+    // ================================= Homework =================================
+    // TODO: a data structure may be needed to store free blocks so they can be managed and merged
+    // HINT: a map can store the free blocks, keyed by each block's start/end address, with the block size as the value
+    // ================================= Homework =================================
+
+    map<size_t, size_t> free_blocks; // free blocks, address -> size per the HINT above (start vs. end key: TODO confirm)
+
+  public:
+    Allocator(Runtime runtime);
+
+    virtual ~Allocator();
+
+    // function: simulate memory allocation
+    // arguments:
+    //     size: size of memory block to be allocated
+    // return: head address offset of the allocated memory block
+    size_t alloc(size_t size);
+
+    // function: simulate memory free
+    // arguments:
+    //     addr: head address offset of memory block to be free
+    //     size: size of memory block to be freed
+    void free(size_t addr, size_t size);
+
+    // function: perform actual memory allocation
+    // return: pointer to the head address of the allocated memory
+    void *getPtr();
+
+    void info();
+
+  private:
+    // function: memory alignment, rounded up
+    // return: size of the aligned memory block
+    size_t getAlignedSize(size_t size);
+  };
+}
diff --git a/include/core/blob.h b/include/core/blob.h
index 01684f6..0e0955a 100644
--- a/include/core/blob.h
+++ b/include/core/blob.h
@@ -1,25 +1,25 @@
-#pragma once
-#include "core/common.h"
-#include "core/ref.h"
-
-namespace infini {
-
-class RuntimeObj;
-using Runtime = Ref<RuntimeObj>;
-
-class BlobObj
-{
-  Runtime runtime;
-  void *ptr;
-
-public:
-  BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {}
-  BlobObj(BlobObj &other) = delete;
-  BlobObj &operator=(BlobObj const &) = delete;
-  ~BlobObj() {};
-
-  template <typename T>
-  T getPtr() const { return reinterpret_cast<T>(ptr); }
-};
-
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include "core/ref.h"
+
+namespace infini {
+
+class RuntimeObj;
+using Runtime = Ref<RuntimeObj>;
+
+class BlobObj
+{
+  Runtime runtime;
+  void *ptr;
+
+public:
+  BlobObj(Runtime runtime, void *ptr) : runtime(runtime), ptr(ptr) {}
+  BlobObj(BlobObj &other) = delete;
+  BlobObj &operator=(BlobObj const &) = delete;
+  ~BlobObj() {};
+
+  template <typename T>
+  T getPtr() const { return reinterpret_cast<T>(ptr); }
+};
+
+} // namespace infini
diff --git a/include/core/common.h b/include/core/common.h
index e4fd65b..5fbd58f 100644
--- a/include/core/common.h
+++ b/include/core/common.h
@@ -1,85 +1,85 @@
-#pragma once
-#include "utils/exception.h"
-#include <cassert>
-#include <functional>
-#include <iostream>
-#include <list>
-#include <map>
-#include <optional>
-#include <set>
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <unordered_set>
-#include <variant>
-#include <vector>
-
-namespace infini {
-using std::list;
-using std::map;
-using std::optional;
-using std::pair;
-using std::set;
-using std::string;
-using std::tie;
-using std::to_string;
-using std::tuple;
-using std::unordered_map;
-using std::vector;
-
-// Metaprogramming utilities
-#define _CAT(A, B) A##B
-#define _SELECT(NAME, NUM) _CAT(NAME##_, NUM)
-#define _GET_COUNT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, ...) COUNT
-#define _VA_SIZE(...) _GET_COUNT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
-#define _VA_SELECT(NAME, ...) _SELECT(NAME, _VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
-
-// Assert: conditions should have no side effect
-#define _IT_ASSERT_2(condition, info)                                          \
-    static_cast<bool>(condition)                                               \
-        ? void(0)                                                              \
-        : throw ::infini::Exception(                                           \
-              std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +   \
-              "] Assertion failed (" + #condition + "): " + info)
-#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
-#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
-
-#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
-#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg)
-#define IT_ASSERT_TODO(condition) _IT_ASSERT_2(condition, "Unimplemented")
-#define IT_TODO_SKIP() puts("Unimplemented " __FILE__ ":" __LINE__)
-
-// std::to_underlying is avaiable since C++23
-template <typename T> auto enum_to_underlying(T e) {
-    return static_cast<std::underlying_type_t<T>>(e);
-}
-
-template <typename T> std::string vecToString(const std::vector<T> &vec) {
-    std::stringstream ss;
-    ss << "[";
-    for (size_t i = 0; i < vec.size(); ++i) {
-        ss << vec.at(i);
-        if (i < vec.size() - 1) {
-            ss << ",";
-        }
-    }
-    ss << "]";
-    return ss.str();
-}
-
-template <typename T> std::string vecToString(const T *st, size_t length) {
-    std::stringstream ss;
-    ss << "[";
-    size_t i = 0;
-    for (i = 0; i < length; i++) {
-        ss << *(st + i);
-        if (i < length - 1) {
-            ss << ",";
-        }
-    }
-    ss << "]";
-    return ss.str();
-}
-
-} // namespace infini
+#pragma once
+#include "utils/exception.h"
+#include <cassert>
+#include <functional>
+#include <iostream>
+#include <list>
+#include <map>
+#include <optional>
+#include <set>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <variant>
+#include <vector>
+
+namespace infini {
+using std::list;
+using std::map;
+using std::optional;
+using std::pair;
+using std::set;
+using std::string;
+using std::tie;
+using std::to_string;
+using std::tuple;
+using std::unordered_map;
+using std::vector;
+
+// Metaprogramming utilities
+#define _CAT(A, B) A##B
+#define _SELECT(NAME, NUM) _CAT(NAME##_, NUM)
+#define _GET_COUNT(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, COUNT, ...) COUNT
+#define _VA_SIZE(...) _GET_COUNT(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+#define _VA_SELECT(NAME, ...) _SELECT(NAME, _VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
+
+// Assert: conditions should have no side effect
+#define _IT_ASSERT_2(condition, info)                                          \
+    static_cast<bool>(condition)                                               \
+        ? void(0)                                                              \
+        : throw ::infini::Exception(                                           \
+              std::string("[") + __FILE__ + ":" + std::to_string(__LINE__) +   \
+              "] Assertion failed (" + #condition + "): " + info)
+#define _IT_ASSERT_1(condition) _IT_ASSERT_2(condition, "")
+#define IT_ASSERT(...) _VA_SELECT(_IT_ASSERT, __VA_ARGS__)
+
+#define IT_TODO_HALT() _IT_ASSERT_2(false, "Unimplemented")
+#define IT_TODO_HALT_MSG(msg) _IT_ASSERT_2(false, msg)
+#define IT_ASSERT_TODO(condition) _IT_ASSERT_2(condition, "Unimplemented")
+#define IT_TODO_SKIP() puts(("Unimplemented " __FILE__ ":" + std::to_string(__LINE__)).c_str()) // __LINE__ is an int, so it cannot be pasted as a string literal
+
+// std::to_underlying is available since C++23
+template <typename T> auto enum_to_underlying(T e) {
+    return static_cast<std::underlying_type_t<T>>(e);
+}
+
+template <typename T> std::string vecToString(const std::vector<T> &vec) {
+    std::stringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < vec.size(); ++i) {
+        ss << vec.at(i);
+        if (i < vec.size() - 1) {
+            ss << ",";
+        }
+    }
+    ss << "]";
+    return ss.str();
+}
+
+template <typename T> std::string vecToString(const T *st, size_t length) {
+    std::stringstream ss;
+    ss << "[";
+    size_t i = 0;
+    for (i = 0; i < length; i++) {
+        ss << *(st + i);
+        if (i < length - 1) {
+            ss << ",";
+        }
+    }
+    ss << "]";
+    return ss.str();
+}
+
+} // namespace infini
diff --git a/include/core/data_type.h b/include/core/data_type.h
index aa0e126..f7fd2dc 100644
--- a/include/core/data_type.h
+++ b/include/core/data_type.h
@@ -1,104 +1,104 @@
-#pragma once
-#include "core/common.h"
-#include <cstdint>
-
-namespace infini {
-
-class DataType {
-  public:
-    // <https://onnx.ai/onnx/intro/concepts.html#element-type>
-    static const DataType Undefine;
-    static const DataType Float32;
-    static const DataType UInt8;
-    static const DataType Int8;
-    static const DataType UInt16;
-    static const DataType Int16;
-    static const DataType Int32;
-    static const DataType Int64;
-    static const DataType String;
-    static const DataType Bool;
-    static const DataType Float16;
-    static const DataType Double;
-    static const DataType UInt32;
-    static const DataType UInt64;
-    static const DataType BFloat16;
-    // "sizePerElement" show the DType to cpu_type
-    // DataType::Bool -> int8_t   DataType::Float16 -> uint16_t
-    static constexpr size_t sizePerElement[]{0,
-                                             sizeof(float),
-                                             sizeof(uint8_t),
-                                             sizeof(int8_t),
-                                             sizeof(uint16_t),
-                                             sizeof(int16_t),
-                                             sizeof(int32_t),
-                                             sizeof(int64_t),
-                                             sizeof(std::string),
-                                             sizeof(int8_t),
-                                             sizeof(uint16_t),
-                                             sizeof(double),
-                                             sizeof(uint32_t),
-                                             sizeof(uint64_t),
-                                             0,
-                                             0,
-                                             sizeof(uint16_t)};
-
-    static constexpr std::string_view names[]{
-        "Undefine",    "Float32", "UInt8",  "Int8",   "UInt16",
-        "Int16",       "Int32",   "Int64",  "String", "Bool",
-        "Float16",     "Double",  "UInt32", "UInt64", "PlaceHolder",
-        "PlaceHolder", "BFloat16"};
-
-    static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5,  6,  7, -1,
-                                   3,  4, 9, 1, 8, -1, -1, 4};
-
-  private:
-    int index;
-
-  public:
-    // FIXME: default ctor should be deleted but json requires it. Solution:
-    // https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
-    DataType() = default;
-    constexpr DataType(int index) : index(index) {}
-    bool operator==(const DataType &rhs) const { return index == rhs.index; }
-    bool operator<(const DataType &rhs) const { return index < rhs.index; }
-
-    template <typename T> static int get() {
-        IT_TODO_HALT_MSG("Unsupported data type");
-    }
-    size_t getSize() const { return sizePerElement[index]; }
-    string toString() const { return string(names[index]); }
-    int cpuTypeInt() const { return cpuType[index]; }
-    int getIndex() const { return index; }
-};
-
-// Method definitions are out of the declaration due to GCC bug:
-// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
-template <> inline int DataType::get<float>() { return 0; }
-template <> inline int DataType::get<uint32_t>() { return 1; }
-template <> inline int DataType::get<uint8_t>() { return 2; }
-template <> inline int DataType::get<int8_t>() { return 3; }
-template <> inline int DataType::get<uint16_t>() { return 4; }
-template <> inline int DataType::get<int16_t>() { return 5; }
-template <> inline int DataType::get<int32_t>() { return 6; }
-template <> inline int DataType::get<int64_t>() { return 7; }
-template <> inline int DataType::get<uint64_t>() { return 8; }
-template <> inline int DataType::get<double>() { return 9; }
-
-template <int index> struct DT {};
-template <> struct DT<0> { using t = bool; };
-template <> struct DT<1> { using t = float; };
-template <> struct DT<2> { using t = uint8_t; };
-template <> struct DT<3> { using t = int8_t; };
-template <> struct DT<4> { using t = uint16_t; };
-template <> struct DT<5> { using t = int16_t; };
-template <> struct DT<6> { using t = int32_t; };
-template <> struct DT<7> { using t = int64_t; };
-template <> struct DT<8> { using t = char; };
-template <> struct DT<9> { using t = int8_t; };
-template <> struct DT<10> { using t = uint16_t; };
-template <> struct DT<11> { using t = double; };
-template <> struct DT<12> { using t = uint32_t; };
-template <> struct DT<13> { using t = uint64_t; };
-template <> struct DT<16> { using t = uint16_t; };
-
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include <cstdint>
+
+namespace infini {
+
+class DataType {
+  public:
+    // <https://onnx.ai/onnx/intro/concepts.html#element-type>
+    static const DataType Undefine;
+    static const DataType Float32;
+    static const DataType UInt8;
+    static const DataType Int8;
+    static const DataType UInt16;
+    static const DataType Int16;
+    static const DataType Int32;
+    static const DataType Int64;
+    static const DataType String;
+    static const DataType Bool;
+    static const DataType Float16;
+    static const DataType Double;
+    static const DataType UInt32;
+    static const DataType UInt64;
+    static const DataType BFloat16;
+    // "sizePerElement" gives the size of the CPU type used to store each DataType, e.g.
+    // DataType::Bool -> int8_t   DataType::Float16 -> uint16_t
+    static constexpr size_t sizePerElement[]{0,
+                                             sizeof(float),
+                                             sizeof(uint8_t),
+                                             sizeof(int8_t),
+                                             sizeof(uint16_t),
+                                             sizeof(int16_t),
+                                             sizeof(int32_t),
+                                             sizeof(int64_t),
+                                             sizeof(std::string),
+                                             sizeof(int8_t),
+                                             sizeof(uint16_t),
+                                             sizeof(double),
+                                             sizeof(uint32_t),
+                                             sizeof(uint64_t),
+                                             0,
+                                             0,
+                                             sizeof(uint16_t)};
+
+    static constexpr std::string_view names[]{
+        "Undefine",    "Float32", "UInt8",  "Int8",   "UInt16",
+        "Int16",       "Int32",   "Int64",  "String", "Bool",
+        "Float16",     "Double",  "UInt32", "UInt64", "PlaceHolder",
+        "PlaceHolder", "BFloat16"};
+
+    static constexpr int cpuType[]{-1, 0, 2, 3, 4, 5,  6,  7, -1,
+                                   3,  4, 9, 1, 8, -1, -1, 4};
+
+  private:
+    int index;
+
+  public:
+    // FIXME: default ctor should be deleted but json requires it. Solution:
+    // https://github.com/nlohmann/json#how-can-i-use-get-for-non-default-constructiblenon-copyable-types
+    DataType() = default;
+    constexpr DataType(int index) : index(index) {}
+    bool operator==(const DataType &rhs) const { return index == rhs.index; }
+    bool operator<(const DataType &rhs) const { return index < rhs.index; }
+
+    template <typename T> static int get() {
+        IT_TODO_HALT_MSG("Unsupported data type");
+    }
+    size_t getSize() const { return sizePerElement[index]; }
+    string toString() const { return string(names[index]); }
+    int cpuTypeInt() const { return cpuType[index]; }
+    int getIndex() const { return index; }
+};
+
+// Method definitions are out of the declaration due to GCC bug:
+// https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
+template <> inline int DataType::get<float>() { return 0; }
+template <> inline int DataType::get<uint32_t>() { return 1; }
+template <> inline int DataType::get<uint8_t>() { return 2; }
+template <> inline int DataType::get<int8_t>() { return 3; }
+template <> inline int DataType::get<uint16_t>() { return 4; }
+template <> inline int DataType::get<int16_t>() { return 5; }
+template <> inline int DataType::get<int32_t>() { return 6; }
+template <> inline int DataType::get<int64_t>() { return 7; }
+template <> inline int DataType::get<uint64_t>() { return 8; }
+template <> inline int DataType::get<double>() { return 9; }
+
+template <int index> struct DT {};
+template <> struct DT<0> { using t = bool; };
+template <> struct DT<1> { using t = float; };
+template <> struct DT<2> { using t = uint8_t; };
+template <> struct DT<3> { using t = int8_t; };
+template <> struct DT<4> { using t = uint16_t; };
+template <> struct DT<5> { using t = int16_t; };
+template <> struct DT<6> { using t = int32_t; };
+template <> struct DT<7> { using t = int64_t; };
+template <> struct DT<8> { using t = char; };
+template <> struct DT<9> { using t = int8_t; };
+template <> struct DT<10> { using t = uint16_t; };
+template <> struct DT<11> { using t = double; };
+template <> struct DT<12> { using t = uint32_t; };
+template <> struct DT<13> { using t = uint64_t; };
+template <> struct DT<16> { using t = uint16_t; };
+
+} // namespace infini
diff --git a/include/core/graph.h b/include/core/graph.h
index c45580c..6801bac 100644
--- a/include/core/graph.h
+++ b/include/core/graph.h
@@ -1,121 +1,121 @@
-#pragma once
-#include "core/allocator.h"
-#include "core/operator.h"
-#include "core/tensor.h"
-#include <algorithm>
-#include <cstdint>
-
-namespace infini
-{
-
-    class GraphObj : public Object
-    {
-    protected:
-        Runtime runtime;
-        TensorVec tensors;
-        OpVec ops;
-        Allocator allocator;
-
-    public:
-        explicit GraphObj(Runtime runtime)
-            : runtime(runtime), allocator(runtime), sorted(false){};
-        string toString() const override;
-        Runtime getRuntime() const { return runtime; }
-
-        Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
-        Tensor addTensor(const Tensor &tensor);
-        TensorVec addTensor(const TensorVec &tensors);
-        void removeOperator(Operator op)
-        {
-            auto it = std::find(ops.begin(), ops.end(), op);
-            if (it != ops.end())
-                ops.erase(it);
-        }
-
-        void removeTensor(Tensor tensor)
-        {
-            auto it = std::find(tensors.begin(), tensors.end(), tensor);
-            if (it != tensors.end())
-                tensors.erase(it);
-        }
-
-        const TensorVec &getTensors() const { return tensors; }
-        const OpVec &getOperators() const { return ops; }
-        Tensor getTensor(int) const;
-
-        /**
-         * @brief Sort the nodes in topological order.
-         * It returns true if the sorting is successful.
-         * Otherwise false is returned, means that there are rings in the graph,
-         * so the topological sorting fails.
-         */
-        bool topo_sort();
-
-        void optimize();
-
-        void shape_infer();
-
-        void dataMalloc();
-
-        /**
-         * @brief Add an operator and create its outputs. Output tensor arguments
-         * should be empty Refs (e.g., nullptr).
-         */
-        template <typename T, typename... Args>
-        Ref<T> addOp(Args &&...args)
-        {
-            Ref<T> op = infini::make_ref<T>(this, std::forward<Args>(args)...);
-            addOperatorAndConnect(op);
-            return op;
-        }
-
-        /**
-         * @brief Add an operator with its outputs specified.
-         */
-        template <typename T, typename... Args>
-        Ref<T> addOpWithOutputs(Args &&...args)
-        {
-            Ref<T> op = infini::make_ref<T>(nullptr, std::forward<Args>(args)...);
-            addOperatorAndConnect(op);
-            return op;
-        }
-
-        /**
-         * @brief Gets input tensors of this graph.
-         */
-        inline TensorVec getInputs() const
-        {
-            TensorVec ret;
-            for (const auto &t : tensors)
-                if (!t->getSource())
-                    ret.emplace_back(t);
-            return ret;
-        }
-
-        /**
-         * @brief Gets output tensors of this graph.
-         */
-        inline TensorVec getOutputs() const
-        {
-            TensorVec ret;
-            for (const auto &t : tensors)
-                if (t->getTargets().empty())
-                    ret.emplace_back(t);
-            return ret;
-        }
-
-        bool checkValid() const;
-
-    private:
-        /**
-         * @brief Add reverse connections and Op relationship in ctor.
-         */
-        void addOperatorAndConnect(const Operator &op);
-
-        /**
-         * @brief If the nodes is sorted in topological order.
-         */
-        bool sorted;
-    };
-
-} // namespace infini
+#pragma once
+#include "core/allocator.h"
+#include "core/operator.h"
+#include "core/tensor.h"
+#include <algorithm>
+#include <cstdint>
+
+namespace infini
+{
+
+    class GraphObj : public Object
+    {
+    protected:
+        Runtime runtime;
+        TensorVec tensors;
+        OpVec ops;
+        Allocator allocator;
+
+    public:
+        explicit GraphObj(Runtime runtime)
+            : runtime(runtime), allocator(runtime), sorted(false){};
+        string toString() const override;
+        Runtime getRuntime() const { return runtime; }
+
+        Tensor addTensor(Shape dim, DataType dtype = DataType::Float32);
+        Tensor addTensor(const Tensor &tensor);
+        TensorVec addTensor(const TensorVec &tensors);
+        void removeOperator(Operator op)
+        {
+            auto it = std::find(ops.begin(), ops.end(), op);
+            if (it != ops.end())
+                ops.erase(it);
+        }
+
+        void removeTensor(Tensor tensor)
+        {
+            auto it = std::find(tensors.begin(), tensors.end(), tensor);
+            if (it != tensors.end())
+                tensors.erase(it);
+        }
+
+        const TensorVec &getTensors() const { return tensors; }
+        const OpVec &getOperators() const { return ops; }
+        Tensor getTensor(int) const;
+
+        /**
+         * @brief Sort the nodes in topological order.
+         * It returns true if the sorting is successful.
+         * Otherwise false is returned, which means the graph contains a cycle,
+         * so the topological sorting fails.
+         */
+        bool topo_sort();
+
+        void optimize();
+
+        void shape_infer();
+
+        void dataMalloc();
+
+        /**
+         * @brief Add an operator and create its outputs. Output tensor arguments
+         * should be empty Refs (e.g., nullptr).
+         */
+        template <typename T, typename... Args>
+        Ref<T> addOp(Args &&...args)
+        {
+            Ref<T> op = infini::make_ref<T>(this, std::forward<Args>(args)...);
+            addOperatorAndConnect(op);
+            return op;
+        }
+
+        /**
+         * @brief Add an operator with its outputs specified.
+         */
+        template <typename T, typename... Args>
+        Ref<T> addOpWithOutputs(Args &&...args)
+        {
+            Ref<T> op = infini::make_ref<T>(nullptr, std::forward<Args>(args)...);
+            addOperatorAndConnect(op);
+            return op;
+        }
+
+        /**
+         * @brief Gets input tensors of this graph.
+         */
+        inline TensorVec getInputs() const
+        {
+            TensorVec ret;
+            for (const auto &t : tensors)
+                if (!t->getSource())
+                    ret.emplace_back(t);
+            return ret;
+        }
+
+        /**
+         * @brief Gets output tensors of this graph.
+         */
+        inline TensorVec getOutputs() const
+        {
+            TensorVec ret;
+            for (const auto &t : tensors)
+                if (t->getTargets().empty())
+                    ret.emplace_back(t);
+            return ret;
+        }
+
+        bool checkValid() const;
+
+    private:
+        /**
+         * @brief Add reverse connections and Op relationship in ctor.
+         */
+        void addOperatorAndConnect(const Operator &op);
+
+        /**
+         * @brief Whether the nodes are sorted in topological order.
+         */
+        bool sorted;
+    };
+
+} // namespace infini
diff --git a/include/core/kernel.h b/include/core/kernel.h
index a762424..6d13ba1 100644
--- a/include/core/kernel.h
+++ b/include/core/kernel.h
@@ -1,87 +1,87 @@
-#pragma once
-#include "core/common.h"
-#include "core/operator.h"
-#include "core/tensor.h"
-#include "utils/operator_utils.h"
-#include <functional>
-
-namespace infini
-{
-
-    class RuntimeObj;
-
-    class Kernel
-    {
-    public:
-        Kernel() {}
-        virtual ~Kernel() {}
-
-        /**
-         * @brief Executes an op with a default parameter.
-         */
-        virtual void compute(const Operator &op,
-                             const RuntimeObj *context) const = 0;
-    };
-
-    class KernelRegistry
-    {
-    public:
-        using KernelRecord =
-            tuple<Kernel *const, const string, const int>; // Kernel, name, ID
-
-    private:
-        std::map<KernelAttrs, KernelRecord> kernels;
-        int nKernels = 0;
-
-    public:
-        ~KernelRegistry()
-        {
-            for (auto &[k, v] : kernels)
-                delete std::get<0>(v);
-        }
-        static KernelRegistry &getInstance()
-        {
-            static KernelRegistry instance;
-            return instance;
-        }
-        bool registerKernel(const KernelAttrs &key, Kernel *kernel, string name)
-        {
-            IT_ASSERT(kernels.find(key) == kernels.end(),
-                      "Kernel already registered");
-            kernels.emplace(key, KernelRecord{kernel, name, ++nKernels});
-            return true;
-        }
-        Kernel *getKernel(const KernelAttrs &kernelAttrs) const
-        {
-            auto it = kernels.find(kernelAttrs);
-            IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
-                                               get_kernel_attrs_str(kernelAttrs) +
-                                               "}");
-            return std::get<0>(it->second);
-        }
-        const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const
-        {
-            return kernels.at(kernelAttrs);
-        }
-    };
-
-    class CpuKernelWithoutConfig : public Kernel
-    {
-    public:
-        virtual void compute(const Operator &op,
-                             const RuntimeObj *context) const = 0;
-    };
-
-} // namespace infini
-
-#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                 \
-    namespace infini                                                          \
-    {                                                                         \
-        static const bool _CAT(_register_kernel_, cnt) =                      \
-            KernelRegistry::getInstance().registerKernel(KernelAttrs{device,  \
-                                                                     opType}, \
-                                                         new kernel(), name); \
-    }
-
-#define REGISTER_KERNEL(device, opType, kernel, name) \
-    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
+#pragma once
+#include "core/common.h"
+#include "core/operator.h"
+#include "core/tensor.h"
+#include "utils/operator_utils.h"
+#include <functional>
+
+namespace infini
+{
+
+    class RuntimeObj;
+
+    class Kernel // abstract interface: concrete kernels implement compute()
+    {
+    public:
+        Kernel() {}
+        virtual ~Kernel() {} // virtual so kernels can be deleted through a Kernel*
+
+        /**
+         * @brief Executes op using the given runtime context (pure virtual).
+         */
+        virtual void compute(const Operator &op,
+                             const RuntimeObj *context) const = 0;
+    };
+
+    class KernelRegistry // maps KernelAttrs to a registered Kernel, its name and an ID
+    {
+    public:
+        using KernelRecord =
+            tuple<Kernel *const, const string, const int>; // Kernel, name, ID
+
+    private:
+        std::map<KernelAttrs, KernelRecord> kernels;
+        int nKernels = 0; // count of registered kernels; also the most recent ID
+
+    public:
+        ~KernelRegistry() // the registry owns the kernels: every stored pointer is deleted
+        {
+            for (auto &[k, v] : kernels)
+                delete std::get<0>(v);
+        }
+        static KernelRegistry &getInstance() // returns the single function-local static registry
+        {
+            static KernelRegistry instance;
+            return instance;
+        }
+        bool registerKernel(const KernelAttrs &key, Kernel *kernel, string name) // always returns true
+        {
+            IT_ASSERT(kernels.find(key) == kernels.end(),
+                      "Kernel already registered");
+            kernels.emplace(key, KernelRecord{kernel, name, ++nKernels}); // IDs start at 1
+            return true;
+        }
+        Kernel *getKernel(const KernelAttrs &kernelAttrs) const // lookup checked with IT_ASSERT
+        {
+            auto it = kernels.find(kernelAttrs);
+            IT_ASSERT(it != kernels.end(), "Kernel not found for key {" +
+                                               get_kernel_attrs_str(kernelAttrs) +
+                                               "}");
+            return std::get<0>(it->second);
+        }
+        const KernelRecord &getKernelItem(const KernelAttrs &kernelAttrs) const
+        {
+            return kernels.at(kernelAttrs); // std::map::at throws std::out_of_range for an unknown key
+        }
+    };
+
+    class CpuKernelWithoutConfig : public Kernel // still abstract: compute() stays pure virtual
+    {
+    public:
+        void compute(const Operator &op, // `override` makes the compiler check this against Kernel::compute
+                     const RuntimeObj *context) const override = 0;
+    };
+
+} // namespace infini
+
+#define _REGISTER_KERNEL_1(device, opType, kernel, name, cnt)                 \
+    namespace infini /* NOTE(review): _REGISTER_KERNEL_1 is a reserved-style name */ \
+    { /* the bool's initializer calls registerKernel with a new kernel() */    \
+        static const bool _CAT(_register_kernel_, cnt) =                      \
+            KernelRegistry::getInstance().registerKernel(KernelAttrs{device,  \
+                                                                     opType}, \
+                                                         new kernel(), name); \
+    }
+
+#define REGISTER_KERNEL(device, opType, kernel, name) /* __COUNTER__ is passed as cnt */ \
+    _REGISTER_KERNEL_1(device, opType, kernel, name, __COUNTER__)
diff --git a/include/core/object.h b/include/core/object.h
index 2db50ad..5ea4265 100644
--- a/include/core/object.h
+++ b/include/core/object.h
@@ -1,71 +1,71 @@
-#pragma once
-#include "core/common.h"
-#include "ref.h"
-
-namespace infini {
-
-using UidBaseType = int;
-
-class Uid {
-  private:
-    UidBaseType uid;
-
-  public:
-    Uid(UidBaseType uid) : uid(uid) {}
-    Uid &operator=(const Uid &rhs) = delete;
-
-    operator UidBaseType() const { return uid; }
-};
-
-class Guid : public Uid {
-  private:
-    UidBaseType generateGuid() {
-        static UidBaseType guidCnt = 0;
-        return ++guidCnt;
-    }
-
-  public:
-    Guid() : Uid(generateGuid()) {}
-    Guid(const Guid &rhs) : Uid(generateGuid()) {}
-};
-
-/**
- * @brief Family unique ID. Cloned tensors shared the same FUID.
- */
-class Fuid : public Uid {
-  private:
-    UidBaseType generateFuid() {
-        static UidBaseType fuidCnt = 0;
-        return ++fuidCnt;
-    }
-
-  public:
-    Fuid() : Uid(generateFuid()) {}
-    Fuid(const Fuid &fuid) : Uid(fuid) {}
-};
-
-class Object {
-  protected:
-    Guid guid;
-
-  public:
-    virtual ~Object(){};
-    virtual string toString() const = 0;
-    void print() { std::cout << toString() << std::endl; }
-    UidBaseType getGuid() const { return guid; }
-};
-
-inline std::ostream &operator<<(std::ostream &os, const Object &obj) {
-    os << obj.toString();
-    return os;
-}
-
-// Overload for Ref-wrapped Object
-template <typename T,
-          typename std::enable_if_t<std::is_base_of_v<Object, T>> * = nullptr>
-inline std::ostream &operator<<(std::ostream &os, const Ref<T> &obj) {
-    os << obj->toString();
-    return os;
-}
-
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include "ref.h"
+
+namespace infini {
+
+using UidBaseType = int;
+
+class Uid { // wraps a UidBaseType value that cannot be reassigned
+  private:
+    UidBaseType uid;
+
+  public:
+    Uid(UidBaseType uid) : uid(uid) {} // implicit conversion from the raw id
+    Uid &operator=(const Uid &rhs) = delete; // ids are fixed after construction
+
+    operator UidBaseType() const { return uid; } // implicit conversion back to the raw id
+};
+
+class Guid : public Uid { // id drawn from a counter; copies get a new id
+  private:
+    UidBaseType generateGuid() {
+        static UidBaseType guidCnt = 0; // shared counter; the first id handed out is 1
+        return ++guidCnt;
+    }
+
+  public:
+    Guid() : Uid(generateGuid()) {}
+    Guid(const Guid &rhs) : Uid(generateGuid()) {} // rhs is ignored: a copy draws a fresh id
+};
+
+/**
+ * @brief Family unique ID. Cloned tensors share the same FUID.
+ */
+class Fuid : public Uid {
+  private:
+    UidBaseType generateFuid() {
+        static UidBaseType fuidCnt = 0; // shared counter; the first id handed out is 1
+        return ++fuidCnt;
+    }
+
+  public:
+    Fuid() : Uid(generateFuid()) {} // a fresh id from the counter
+    Fuid(const Fuid &fuid) : Uid(fuid) {} // a copy keeps the id of its source
+};
+
+class Object { // abstract base: carries a Guid and requires a toString()
+  protected:
+    Guid guid; // default-constructed, so it draws a new id from Guid's counter
+
+  public:
+    virtual ~Object() = default;
+    virtual string toString() const = 0; // textual form supplied by subclasses
+    void print() const { std::cout << toString() << std::endl; } // const: it only calls toString()
+    UidBaseType getGuid() const { return guid; } // converted through Uid's conversion operator
+};
+
+inline std::ostream &operator<<(std::ostream &os, const Object &obj) {
+    // Formatting is delegated to the object's own toString().
+    return os << obj.toString();
+}
+
+// Streams a Ref-wrapped Object by printing the pointee's toString().
+template <typename T,
+          typename std::enable_if_t<std::is_base_of_v<Object, T>> * = nullptr>
+inline std::ostream &operator<<(std::ostream &os, const Ref<T> &obj) {
+    // obj is dereferenced without a null check.
+    return os << obj->toString();
+}
+
+} // namespace infini
diff --git a/include/core/op_type.h b/include/core/op_type.h
index ffe2d6e..806f09d 100644
--- a/include/core/op_type.h
+++ b/include/core/op_type.h
@@ -1,43 +1,43 @@
-﻿#pragma once
-#ifndef OP_TYPE_H
-#define OP_TYPE_H
-
-#include <cstdint>
-#include <string>
-#include <unordered_set>
-
-namespace infini
-{
-    struct OpType
-    {
-        using underlying_t = uint16_t;
-        enum : underlying_t
-        {
-            Unknown,
-            Add,
-            Cast,
-            Clip,
-            Concat,
-            Div,
-            Mul,
-            MatMul,
-            Relu,
-            Sub,
-            Transpose,
-
-        } type;
-
-        constexpr OpType(decltype(type) t) : type(t) {}
-        constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {}
-        constexpr underlying_t underlying() const { return type; }
-
-        bool operator==(OpType others) const { return type == others.type; }
-        bool operator!=(OpType others) const { return type != others.type; }
-        bool operator<(OpType others) const { return type < others.type; }
-
-        const char *toString() const;
-    };
-
-} // namespace infini
-
-#endif // OP_TYPE_H
+﻿#pragma once
+#ifndef OP_TYPE_H
+#define OP_TYPE_H
+
+#include <cstdint>
+#include <string>
+#include <unordered_set>
+
+namespace infini
+{
+    struct OpType // wraps an operator-kind enumerator stored as underlying_t
+    {
+        using underlying_t = uint16_t;
+        enum : underlying_t
+        {
+            Unknown, // first enumerator, value 0
+            Add,
+            Cast,
+            Clip,
+            Concat,
+            Div,
+            Mul,
+            MatMul,
+            Relu,
+            Sub,
+            Transpose,
+
+        } type;
+
+        constexpr OpType(decltype(type) t) : type(t) {} // implicit from an enumerator
+        constexpr explicit OpType(underlying_t val) : type((decltype(type))val) {} // raw value is cast without a range check
+        constexpr underlying_t underlying() const { return type; }
+
+        bool operator==(OpType others) const { return type == others.type; }
+        bool operator!=(OpType others) const { return type != others.type; }
+        bool operator<(OpType others) const { return type < others.type; } // orders by enumerator value
+
+        const char *toString() const; // declared here, defined elsewhere
+    };
+
+} // namespace infini
+
+#endif // OP_TYPE_H
diff --git a/include/core/operator.h b/include/core/operator.h
index 0641007..66b5596 100644
--- a/include/core/operator.h
+++ b/include/core/operator.h
@@ -1,93 +1,93 @@
-#pragma once
-
-#include "core/op_type.h"
-#include "core/tensor.h"
-
-namespace infini
-{
-    using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
-
-    class GraphObj;
-    class OperatorObj : public Object
-    {
-        friend class GraphObj;
-
-    protected:
-        OpType type;
-        TensorVec inputs;
-        TensorVec outputs;
-        vector<WRef<OperatorObj>> predecessors;
-        vector<WRef<OperatorObj>> successors;
-
-    public:
-        OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
-        virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
-        virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
-        /**
-         * @brief Constructs outputs (if requried) and check whether the operator is
-         * valid.
-         *
-         * @param graph If graph is not nullptr, outputs should be created in this
-         * function.
-         */
-        bool checkValid(GraphObj *graph);
-
-    public: // getter and setter
-        const TensorVec &getInputs() const { return inputs; }
-        const TensorVec &getOutputs() const { return outputs; }
-        Tensor getInputs(size_t i) const { return inputs.at(i); }
-        Tensor getOutput() const
-        {
-            IT_ASSERT(outputs.size() == 1, "Unimplemented");
-            return outputs[0];
-        }
-        Tensor getOutput(size_t i) const
-        {
-            IT_ASSERT(i < outputs.size(), "Index exceeded");
-            return outputs.at(i);
-        }
-        OpVec getPredecessors() const { return wrefs_to_refs(predecessors); }
-        OpVec getSuccessors() const { return wrefs_to_refs(successors); }
-        OpType getOpType() const { return type; }
-        // HACK: set correct data type
-        DataType getDType() const { return getInputs(0)->getDType(); }
-        DataType getOutDType() const { return getOutput()->getDType(); }
-        virtual int numInputs() const = 0;
-        virtual int numOutputs() const = 0;
-
-        /**
-         * @brief Clone this operator and replace its inputs and outputs.
-         *
-         * @param newInputs
-         * @param newOutputs
-         * @return Operator
-         */
-        virtual Operator clone(const TensorVec &newInputs,
-                               const TensorVec &newOutputs) const = 0;
-
-    protected:
-        optional<vector<Shape>> inferShape();
-        vector<DataType> inferDataType() const;
-
-    private:
-        void addPredecessors(const Operator &op) { predecessors.emplace_back(op); }
-        void addSuccessors(const Operator &op) { successors.emplace_back(op); }
-        void removePredecessors(const Operator &op);
-        void removeSuccessors(const Operator &op);
-        void replaceInput(Tensor t1, Tensor t2);
-    };
-
-#define OP_CLONE(OpObj)                                                \
-    virtual Operator clone(const TensorVec &newInputs,                 \
-                           const TensorVec &newOutputs) const override \
-    {                                                                  \
-        auto op = infini::make_ref<OpObj>(*this);                      \
-        op->inputs = newInputs;                                        \
-        op->outputs = newOutputs;                                      \
-        op->predecessors.clear();                                      \
-        op->successors.clear();                                        \
-        IT_ASSERT(op->checkValid(nullptr));                            \
-        return op;                                                     \
-    }
-
-} // namespace infini
+#pragma once
+
+#include "core/op_type.h"
+#include "core/tensor.h"
+
+namespace infini
+{
+    using KernelAttrs = std::tuple<Device, OpType::underlying_t>;
+
+    class GraphObj;
+    class OperatorObj : public Object
+    {
+        friend class GraphObj; // GraphObj may call the private connection helpers below
+
+    protected:
+        OpType type;
+        TensorVec inputs;
+        TensorVec outputs;
+        vector<WRef<OperatorObj>> predecessors; // held as WRefs (weak), not owning
+        vector<WRef<OperatorObj>> successors;   // held as WRefs (weak), not owning
+
+    public:
+        OperatorObj(OpType opType, TensorVec inputs, TensorVec outputs);
+        virtual optional<vector<Shape>> inferShape(const TensorVec &inputs) = 0;
+        virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
+        /**
+         * @brief Constructs outputs (if required) and checks whether the operator
+         * is valid.
+         *
+         * @param graph If graph is not nullptr, outputs should be created in this
+         * function.
+         */
+        bool checkValid(GraphObj *graph);
+
+    public: // getter and setter
+        const TensorVec &getInputs() const { return inputs; }
+        const TensorVec &getOutputs() const { return outputs; }
+        Tensor getInputs(size_t i) const { return inputs.at(i); } // bounds-checked by vector::at
+        Tensor getOutput() const // only usable when there is exactly one output
+        {
+            IT_ASSERT(outputs.size() == 1, "Unimplemented");
+            return outputs[0];
+        }
+        Tensor getOutput(size_t i) const
+        {
+            IT_ASSERT(i < outputs.size(), "Index exceeded");
+            return outputs.at(i);
+        }
+        OpVec getPredecessors() const { return wrefs_to_refs(predecessors); }
+        OpVec getSuccessors() const { return wrefs_to_refs(successors); }
+        OpType getOpType() const { return type; }
+        // HACK: set correct data type
+        DataType getDType() const { return getInputs(0)->getDType(); } // data type of input 0
+        DataType getOutDType() const { return getOutput()->getDType(); } // data type of the single output
+        virtual int numInputs() const = 0;
+        virtual int numOutputs() const = 0;
+
+        /**
+         * @brief Clone this operator and replace its inputs and outputs.
+         *
+         * @param newInputs Inputs for the cloned operator.
+         * @param newOutputs Outputs for the cloned operator.
+         * @return Operator The cloned operator.
+         */
+        virtual Operator clone(const TensorVec &newInputs,
+                               const TensorVec &newOutputs) const = 0;
+
+    protected:
+        optional<vector<Shape>> inferShape();
+        vector<DataType> inferDataType() const;
+
+    private:
+        void addPredecessors(const Operator &op) { predecessors.emplace_back(op); } // stored as a WRef
+        void addSuccessors(const Operator &op) { successors.emplace_back(op); } // stored as a WRef
+        void removePredecessors(const Operator &op);
+        void removeSuccessors(const Operator &op);
+        void replaceInput(Tensor t1, Tensor t2);
+    };
+
+#define OP_CLONE(OpObj) /* defines the clone() override for class OpObj */ \
+    virtual Operator clone(const TensorVec &newInputs,                 \
+                           const TensorVec &newOutputs) const override \
+    {                                                                  \
+        auto op = infini::make_ref<OpObj>(*this); /* copy of *this */   \
+        op->inputs = newInputs;                                        \
+        op->outputs = newOutputs;                                      \
+        op->predecessors.clear(); /* the clone starts unconnected */   \
+        op->successors.clear();                                        \
+        IT_ASSERT(op->checkValid(nullptr)); /* no graph is passed */    \
+        return op;                                                     \
+    }
+
+} // namespace infini
diff --git a/include/core/ref.h b/include/core/ref.h
index 3393f6e..d7f2976 100644
--- a/include/core/ref.h
+++ b/include/core/ref.h
@@ -1,43 +1,43 @@
-#pragma once
-#include "core/common.h"
-#include <functional>
-#include <memory>
-#include <type_traits>
-
-namespace infini {
-
-template <typename T> using Ref = std::shared_ptr<T>;
-template <typename T> using WRef = std::weak_ptr<T>;
-
-template <typename T> struct is_ref : std::false_type {};
-template <typename T> struct is_ref<Ref<T>> : std::true_type {};
-template <typename T> struct is_ref<WRef<T>> : std::true_type {};
-
-template <typename T, typename... Params> Ref<T> make_ref(Params &&...params) {
-    static_assert(is_ref<T>::value == false, "Ref should not be nested");
-    return std::make_shared<T>(std::forward<Params>(params)...);
-}
-
-template <class T, class U,
-          typename std::enable_if_t<std::is_base_of_v<U, T>> * = nullptr>
-Ref<T> as(const Ref<U> &ref) {
-    return std::dynamic_pointer_cast<T>(ref);
-}
-
-template <typename T>
-std::vector<WRef<T>> refs_to_wrefs(const std::vector<Ref<T>> &refs) {
-    std::vector<WRef<T>> wrefs;
-    for (const auto &ref : refs)
-        wrefs.emplace_back(ref);
-    return wrefs;
-}
-
-template <typename T>
-std::vector<Ref<T>> wrefs_to_refs(const std::vector<WRef<T>> &wrefs) {
-    std::vector<Ref<T>> refs;
-    for (const auto &wref : wrefs)
-        refs.emplace_back(wref);
-    return refs;
-}
-
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+namespace infini {
+
+template <typename T> using Ref = std::shared_ptr<T>;
+template <typename T> using WRef = std::weak_ptr<T>;
+
+template <typename T> struct is_ref : std::false_type {}; // primary template: T is not a Ref/WRef
+template <typename T> struct is_ref<Ref<T>> : std::true_type {}; // true for Ref<T>
+template <typename T> struct is_ref<WRef<T>> : std::true_type {}; // true for WRef<T>
+
+template <typename T, typename... Params> Ref<T> make_ref(Params &&...params) {
+    static_assert(is_ref<T>::value == false, "Ref should not be nested"); // rejects make_ref<Ref<X>> and make_ref<WRef<X>>
+    return std::make_shared<T>(std::forward<Params>(params)...); // forwards params to T's constructor
+}
+
+template <class T, class U, // enabled only when U is a base of T
+          typename std::enable_if_t<std::is_base_of_v<U, T>> * = nullptr>
+Ref<T> as(const Ref<U> &ref) {
+    return std::dynamic_pointer_cast<T>(ref); // yields an empty Ref if the dynamic cast fails
+}
+
+template <typename T>
+std::vector<WRef<T>> refs_to_wrefs(const std::vector<Ref<T>> &refs) {
+    // Range construction turns every Ref<T> into a WRef<T>, in the same
+    // order as the input vector.
+    std::vector<WRef<T>> weak(refs.begin(), refs.end());
+    return weak;
+}
+
+template <typename T>
+std::vector<Ref<T>> wrefs_to_refs(const std::vector<WRef<T>> &wrefs) {
+    std::vector<Ref<T>> refs;
+    for (const auto &wref : wrefs)
+        refs.emplace_back(wref); // shared_ptr(weak_ptr) throws std::bad_weak_ptr if wref has expired
+    return refs;
+}
+
+} // namespace infini
diff --git a/include/core/runtime.h b/include/core/runtime.h
index 1b64cd9..d70c603 100644
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@@ -1,69 +1,69 @@
-#pragma once
-#include "core/common.h"
-#include "core/op_type.h"
-#include "core/ref.h"
-
-namespace infini
-{
-  class TensorObj;
-  class OperatorObj;
-  class GraphObj;
-  class RuntimeObj;
-  class BlobObj;
-
-  using Tensor = Ref<TensorObj>;
-  using Operator = Ref<OperatorObj>;
-  using Graph = Ref<GraphObj>;
-  using Runtime = Ref<RuntimeObj>;
-  using Blob = Ref<BlobObj>;
-
-  using TensorVec = vector<Tensor>;
-  using OpVec = vector<Operator>;
-
-  enum class Device
-  {
-    CPU = 1
-  };
-
-  class RuntimeObj : public std::enable_shared_from_this<RuntimeObj>
-  {
-  protected:
-    Device device;
-
-  public:
-    explicit RuntimeObj(Device device)
-        : device(device) {}
-    RuntimeObj(RuntimeObj &other) = delete;
-    RuntimeObj &operator=(RuntimeObj const &) = delete;
-    virtual ~RuntimeObj() {}
-
-    virtual void run(const Graph &graph) const = 0;
-    virtual void *alloc(size_t size) = 0;
-    virtual void dealloc(void *ptr) = 0;
-
-    bool isCpu() const
-    {
-      return true;
-    }
-
-    virtual string toString() const = 0;
-  };
-
-  class NativeCpuRuntimeObj : public RuntimeObj
-  {
-  public:
-    NativeCpuRuntimeObj() : RuntimeObj(Device::CPU) {}
-
-    static Ref<NativeCpuRuntimeObj> &getInstance()
-    {
-      static Ref<NativeCpuRuntimeObj> instance =
-          make_ref<NativeCpuRuntimeObj>();
-      return instance;
-    }
-    void dealloc(void *ptr) override;
-    void run(const Graph &graph) const override;
-    void *alloc(size_t size) override;
-    string toString() const override;
-  };
-
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include "core/op_type.h"
+#include "core/ref.h"
+
+namespace infini
+{
+  class TensorObj;
+  class OperatorObj;
+  class GraphObj;
+  class RuntimeObj;
+  class BlobObj;
+
+  using Tensor = Ref<TensorObj>;
+  using Operator = Ref<OperatorObj>;
+  using Graph = Ref<GraphObj>;
+  using Runtime = Ref<RuntimeObj>;
+  using Blob = Ref<BlobObj>;
+
+  using TensorVec = vector<Tensor>;
+  using OpVec = vector<Operator>;
+
+  enum class Device // device kind stored by RuntimeObj
+  {
+    CPU = 1 // the only enumerator defined here
+  };
+
+  class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> // abstract runtime bound to one Device
+  {
+  protected:
+    Device device; // set once by the constructor
+
+  public:
+    explicit RuntimeObj(Device device)
+        : device(device) {}
+    RuntimeObj(const RuntimeObj &other) = delete; // non-copyable (conventional const-reference form)
+    RuntimeObj &operator=(RuntimeObj const &) = delete;
+    virtual ~RuntimeObj() {}
+
+    virtual void run(const Graph &graph) const = 0;
+    virtual void *alloc(size_t size) = 0;
+    virtual void dealloc(void *ptr) = 0;
+
+    bool isCpu() const // derived from the stored device rather than hard-coded
+    {
+      return device == Device::CPU;
+    }
+
+    virtual string toString() const = 0;
+  };
+
+  class NativeCpuRuntimeObj : public RuntimeObj // RuntimeObj constructed with Device::CPU
+  {
+  public:
+    NativeCpuRuntimeObj() : RuntimeObj(Device::CPU) {}
+
+    static Ref<NativeCpuRuntimeObj> &getInstance() // returns a reference to a function-local static Ref
+    {
+      static Ref<NativeCpuRuntimeObj> instance =
+          make_ref<NativeCpuRuntimeObj>(); // initialised on the first call
+      return instance;
+    }
+    void dealloc(void *ptr) override;
+    void run(const Graph &graph) const override;
+    void *alloc(size_t size) override;
+    string toString() const override;
+  };
+
+} // namespace infini
diff --git a/include/core/tensor.h b/include/core/tensor.h
index 93eec14..3c845e2 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -1,164 +1,164 @@
-#pragma once
-#include "core/blob.h"
-#include "core/data_type.h"
-#include "core/object.h"
-#include "core/runtime.h"
-#include <cmath>
-#include <cstring>
-#include <fstream>
-
-namespace infini
-{
-    class GraphObj;
-    using ShapeElem = int;
-    using Shape = vector<ShapeElem>;
-    class TensorObj : public Object
-    {
-        friend class GraphObj;
-
-    protected:
-        int dim;
-
-        DataType dtype;
-        vector<WRef<OperatorObj>> targets;
-        WRef<OperatorObj> source;
-        Blob data;
-        Runtime runtime;
-
-    private:
-        Shape shape;
-        size_t _size; // Cache of Π(shape).
-        Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
-                      // scratch have a new id.
-
-    public:
-        TensorObj(Shape shape, DataType dtype, Runtime runtime);
-        virtual ~TensorObj() {}
-        string toString() const override;
-
-        size_t size() const { return _size; }
-        size_t getBytes() const { return _size * dtype.getSize(); }
-
-        Shape getDims() const { return shape; }
-        void setShape(Shape shape_);
-        size_t getRank() const { return shape.size(); }
-        UidBaseType getFuid() const { return fuid; }
-
-        void setData(
-            std::function<void(void *, size_t, DataType)> const &generator) const;
-
-        void setDataBlob(const Blob &blob);
-
-        void printData() const;
-        bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
-
-        template <typename T>
-        bool equalData(const vector<T> &dataVector)
-        {
-            IT_ASSERT(size() == dataVector.size());
-            IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
-            return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
-        }
-
-        template <typename T>
-        T getRawDataPtr() const
-        {
-            static_assert(std::is_pointer_v<T>,
-                          "Raw data pointer has a type of pointer");
-            IT_ASSERT(data != nullptr);
-            return data->getPtr<T>();
-        }
-
-        DataType getDType() const { return dtype; }
-        Runtime getRuntime() const { return runtime; }
-
-        OpVec getTargets() const { return wrefs_to_refs(targets); }
-        Operator getSource() const { return source.lock(); }
-
-    private:
-        template <class T>
-        string dataToString() const
-        {
-            std::stringstream builder;
-            builder << "Tensor: " << guid << std::endl;
-
-            auto numDims = shape.size();
-            auto dimSzVec = vector<int>(numDims, 1);
-            auto ptr = data->getPtr<T *>();
-            dimSzVec[numDims - 1] = shape[numDims - 1];
-
-            for (int i = numDims - 1; i != 0; --i)
-                dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
-
-            for (size_t i = 0, iEnd = size(); i < iEnd; ++i)
-            {
-                for (size_t j = 0; j < numDims; ++j)
-                    if (i % dimSzVec[j] == 0)
-                        builder << "[";
-
-                builder << ptr[i];
-                for (size_t j = 0; j < numDims; ++j)
-                    if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
-                        builder << "]";
-
-                if (i != size() - 1)
-                    builder << ", ";
-
-                auto column = (size_t)dimSzVec[numDims - 1];
-                if (i % column == column - 1)
-                    builder << std::endl;
-            }
-            return builder.str();
-        }
-
-        template <typename T>
-        bool equalDataImpl(const T *a, const T *b, size_t size,
-                           double relativeError = 1e-6) const
-        {
-            for (size_t i = 0; i < size; ++i)
-            {
-                if constexpr (std::is_integral_v<T>)
-                {
-                    if (a[i] != b[i])
-                        return false;
-                }
-                else if constexpr (std::is_floating_point_v<T>)
-                {
-                    if (std::min(fabs(a[i]), fabs(b[i])) == 0. &&
-                        fabs(a[i] - b[i]) > relativeError)
-                    {
-                        printf("Error on %lu: %f %f\n", i, a[i], b[i]);
-                        return false;
-                    }
-                    else if (std::min(fabs(a[i]), fabs(b[i])) != 0. &&
-                             fabs(a[i] - b[i]) /
-                                     std::max(fabs(a[i]), fabs(b[i])) >
-                                 relativeError)
-                    {
-                        printf("Error on %lu: %f %f\n", i, a[i], b[i]);
-                        return false;
-                    }
-                }
-                else
-                {
-                    static_assert(!sizeof(T), "Unsupported data type");
-                }
-            }
-            return true;
-        }
-
-        void addTarget(const Operator &op) { targets.emplace_back(op); }
-        void setSource(const Operator &op) { source = op; }
-        void removeTarget(const Operator &op)
-        {
-            for (auto itr = targets.begin(); itr != targets.end();)
-            {
-                if (itr->lock() == op)
-                    itr = targets.erase(itr);
-                else
-                    ++itr;
-            }
-        }
-    };
-
-} // namespace infini
+#pragma once
+#include "core/blob.h"
+#include "core/data_type.h"
+#include "core/object.h"
+#include "core/runtime.h"
+#include <cmath>
+#include <cstring>
+#include <fstream>
+
+namespace infini
+{
+    class GraphObj;
+    using ShapeElem = int;
+    using Shape = vector<ShapeElem>;
+    class TensorObj : public Object
+    {
+        friend class GraphObj;
+
+    protected:
+        int dim;
+
+        DataType dtype;
+        vector<WRef<OperatorObj>> targets;
+        WRef<OperatorObj> source;
+        Blob data;
+        Runtime runtime;
+
+    private:
+        Shape shape;
+        size_t _size; // Cache of Π(shape).
+        Fuid fuid;    // Cloned tensors share the same id. Tensors constructed from
+                      // scratch have a new id.
+
+    public:
+        TensorObj(Shape shape, DataType dtype, Runtime runtime);
+        virtual ~TensorObj() {}
+        string toString() const override;
+
+        size_t size() const { return _size; }
+        size_t getBytes() const { return _size * dtype.getSize(); }
+
+        Shape getDims() const { return shape; }
+        void setShape(Shape shape_);
+        size_t getRank() const { return shape.size(); }
+        UidBaseType getFuid() const { return fuid; }
+
+        void setData(
+            std::function<void(void *, size_t, DataType)> const &generator) const;
+
+        void setDataBlob(const Blob &blob);
+
+        void printData() const;
+        bool equalData(const Tensor &rhs, double relativeError = 1e-6) const;
+
+        template <typename T>
+        bool equalData(const vector<T> &dataVector)
+        {
+            IT_ASSERT(size() == dataVector.size());
+            IT_ASSERT(DataType::get<T>() == dtype.cpuTypeInt());
+            return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
+        }
+
+        template <typename T>
+        T getRawDataPtr() const
+        {
+            static_assert(std::is_pointer_v<T>,
+                          "Raw data pointer has a type of pointer");
+            IT_ASSERT(data != nullptr);
+            return data->getPtr<T>();
+        }
+
+        DataType getDType() const { return dtype; }
+        Runtime getRuntime() const { return runtime; }
+
+        OpVec getTargets() const { return wrefs_to_refs(targets); }
+        Operator getSource() const { return source.lock(); }
+
+    private:
+        template <class T>
+        string dataToString() const
+        {
+            std::stringstream builder;
+            builder << "Tensor: " << guid << std::endl;
+
+            auto numDims = shape.size();
+            auto dimSzVec = vector<int>(numDims, 1);
+            auto ptr = data->getPtr<T *>();
+            dimSzVec[numDims - 1] = shape[numDims - 1];
+
+            for (int i = numDims - 1; i != 0; --i)
+                dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+
+            for (size_t i = 0, iEnd = size(); i < iEnd; ++i)
+            {
+                for (size_t j = 0; j < numDims; ++j)
+                    if (i % dimSzVec[j] == 0)
+                        builder << "[";
+
+                builder << ptr[i];
+                for (size_t j = 0; j < numDims; ++j)
+                    if ((int)i % dimSzVec[j] == dimSzVec[j] - 1)
+                        builder << "]";
+
+                if (i != size() - 1)
+                    builder << ", ";
+
+                auto column = (size_t)dimSzVec[numDims - 1];
+                if (i % column == column - 1)
+                    builder << std::endl;
+            }
+            return builder.str();
+        }
+
+        // Element-wise comparison of two buffers holding `size` elements.
+        // Integers must match exactly. Floats use |a-b| <= relativeError when
+        // either value is zero, else |a-b| / max(|a|,|b|) <= relativeError.
+        // Prints the first mismatching float pair; returns false on mismatch.
+        template <typename T>
+        bool equalDataImpl(const T *a, const T *b, size_t size,
+                           double relativeError = 1e-6) const
+        {
+            for (size_t i = 0; i < size; ++i)
+            {
+                if constexpr (std::is_integral_v<T>)
+                {
+                    if (a[i] != b[i])
+                        return false;
+                }
+                else if constexpr (std::is_floating_point_v<T>)
+                {
+                    const auto diff = fabs(a[i] - b[i]);
+                    const auto lo = std::min(fabs(a[i]), fabs(b[i]));
+                    const auto hi = std::max(fabs(a[i]), fabs(b[i]));
+                    // Absolute error when a value is zero, relative otherwise.
+                    if ((lo == 0. ? diff : diff / hi) > relativeError)
+                    {
+                        // %zu is the portable printf conversion for size_t.
+                        printf("Error on %zu: %f %f\n", i, a[i], b[i]);
+                        return false;
+                    }
+                }
+                else
+                {
+                    static_assert(!sizeof(T), "Unsupported data type");
+                }
+            }
+            return true;
+        }
+
+        void addTarget(const Operator &op) { targets.emplace_back(op); }
+        void setSource(const Operator &op) { source = op; }
+        void removeTarget(const Operator &op)
+        {
+            for (auto itr = targets.begin(); itr != targets.end();)
+            {
+                if (itr->lock() == op)
+                    itr = targets.erase(itr);
+                else
+                    ++itr;
+            }
+        }
+    };
+
+} // namespace infini
diff --git a/include/operators/concat.h b/include/operators/concat.h
index 86287fd..d1a9591 100644
--- a/include/operators/concat.h
+++ b/include/operators/concat.h
@@ -1,32 +1,32 @@
-#pragma once
-#include "core/operator.h"
-
-namespace infini {
-/**
- * @brief Concatenate several tensors into one. All the input tensors should
- * have the same shape except for the concatenated dimension.
- *
- */
-class ConcatObj : public OperatorObj {
-    int dim;
-
-  public:
-    /**
-     * @brief Construct a new Concat object.
-     *
-     * @param graph The computation graph that this operator belongs to.
-     * @param inputs The input tensors to be concatenated.
-     * @param output Concatenated tensor.
-     * @param dim The dimension to concatenate on.
-     */
-    ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
-    OP_CLONE(ConcatObj);
-
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-    std::string toString() const override;
-    int numInputs() const override { return inputs.size(); }
-    int numOutputs() const override { return 1; }
-    int getDim() const { return dim; }
-};
-} // namespace infini
+#pragma once
+#include "core/operator.h"
+
+namespace infini {
+/**
+ * @brief Concatenate several tensors into one. All the input tensors should
+ * have the same shape except for the concatenated dimension.
+ *
+ */
+class ConcatObj : public OperatorObj {
+    int dim;
+
+  public:
+    /**
+     * @brief Construct a new Concat object.
+     *
+     * @param graph The computation graph that this operator belongs to.
+     * @param inputs The input tensors to be concatenated.
+     * @param output Concatenated tensor.
+     * @param dim The dimension to concatenate on.
+     */
+    ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int dim);
+    OP_CLONE(ConcatObj);
+
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return inputs.size(); }
+    int numOutputs() const override { return 1; }
+    int getDim() const { return dim; }
+};
+} // namespace infini
diff --git a/include/operators/element_wise.h b/include/operators/element_wise.h
index 4260b2d..2f64a24 100644
--- a/include/operators/element_wise.h
+++ b/include/operators/element_wise.h
@@ -1,47 +1,47 @@
-#pragma once
-#include "core/operator.h"
-
-namespace infini
-{
-  /**
-   * @brief Base class of **binary** element-wise operators.
-   * Unary operators like activations are not the derived classes of
-   * ElementWiseObj.
-   *
-   */
-  class ElementWiseObj : public OperatorObj
-  {
-  public:
-    /**
-     * @brief Construct a new ElementWise object
-     *
-     * @param type Operator type.
-     * @param graph The computation graph that this operator belongs to.
-     * @param input0 The first input tensor.
-     * @param input1 The second input tensor.
-     * @param output The output tensor.
-     */
-    ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
-                   Tensor output);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-    std::string toString() const override;
-    int numInputs() const override { return 2; }
-    int numOutputs() const override { return 1; }
-    };
-
-#define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                    \
-  class prefix##Obj : public ElementWiseObj                      \
-  {                                                              \
-  public:                                                        \
-    prefix##Obj(GraphObj *graph, Tensor input0, Tensor input1,   \
-                Tensor output)                                   \
-        : ElementWiseObj(type, graph, input0, input1, output) {} \
-    OP_CLONE(prefix##Obj);                                       \
-  };
-
-  DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add)
-  DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
-  DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
-  DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
-}; // namespace infini
+#pragma once
+#include "core/operator.h"
+
+namespace infini
+{
+  /**
+   * @brief Base class of **binary** element-wise operators.
+   * Unary operators like activations are not the derived classes of
+   * ElementWiseObj.
+   *
+   */
+  class ElementWiseObj : public OperatorObj
+  {
+  public:
+    /**
+     * @brief Construct a new ElementWise object
+     *
+     * @param type Operator type.
+     * @param graph The computation graph that this operator belongs to.
+     * @param input0 The first input tensor.
+     * @param input1 The second input tensor.
+     * @param output The output tensor.
+     */
+    ElementWiseObj(OpType type, GraphObj *graph, Tensor input0, Tensor input1,
+                   Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 2; }
+    int numOutputs() const override { return 1; }
+    };
+
+#define DEFINE_ELEMENT_WISE_OBJ(prefix, type)                    \
+  class prefix##Obj : public ElementWiseObj                      \
+  {                                                              \
+  public:                                                        \
+    prefix##Obj(GraphObj *graph, Tensor input0, Tensor input1,   \
+                Tensor output)                                   \
+        : ElementWiseObj(type, graph, input0, input1, output) {} \
+    OP_CLONE(prefix##Obj);                                       \
+  };
+
+  DEFINE_ELEMENT_WISE_OBJ(Add, OpType::Add)
+  DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
+  DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
+  DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
+}; // namespace infini
diff --git a/include/operators/matmul.h b/include/operators/matmul.h
index 4925895..517edff 100644
--- a/include/operators/matmul.h
+++ b/include/operators/matmul.h
@@ -1,60 +1,60 @@
-#pragma once
-#include "core/operator.h"
-
-namespace infini
-{
-    /**
-     * @brief Matrix multiplication.
-     *
-     */
-    class MatmulObj : public OperatorObj
-    {
-    private:
-        // InfiniTensor assumes a row-major tensor layout. `transA`=false means
-        // default dims, true means A should be transposed before matmul. This is in
-        // oppsite to the column-major BLAS.
-        bool transA, transB;
-
-        // Auxiliary attributes which are not a part of operator attributes.
-        int m, n, k;
-
-    public:
-        /**
-         * @brief Matmul operator with batch broadcast and tensor transpose
-         * supports. Only one tensor with singe batch can be broadcasted due to the
-         * BLAS interface restriction. Tranpose indicates whether the last two
-         * dimensions should be transposed before Matmul and does not affect other
-         * leading dimensions.
-         *
-         * Matmul show how operators are defined in InfiniTensor. The constructor of
-         * an operator can create output tensors for the operator or not, which
-         * depends on `graph`.
-         *
-         * @param graph The computation graph that this operator belongs to.
-         * @param A The input tensor.
-         * @param B The input tensor.
-         * @param C C is the output of Matmul. If outputs are going to be created in
-         * the constructor, C should be an empty Ref.
-         * @param transA If matrix A should be transposed when computing.
-         * @param transB If matrix B should be transposed when computing.
-         */
-        MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
-                  bool transA = false, bool transB = false);
-        OP_CLONE(MatmulObj);
-
-        std::string toString() const override;
-        optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-        int numInputs() const override { return inputs.size(); }
-        int numOutputs() const override { return 1; }
-
-        bool getTransA() const { return transA; }
-        bool getTransB() const { return transB; }
-        void setTransA(bool transA) { this->transA = transA; }
-        void setTransB(bool transB) { this->transB = transB; }
-        int getM() const { return m; }
-        int getN() const { return n; }
-        int getK() const { return k; }
-    };
-
+#pragma once
+#include "core/operator.h"
+
+namespace infini
+{
+    /**
+     * @brief Matrix multiplication.
+     *
+     */
+    class MatmulObj : public OperatorObj
+    {
+    private:
+        // InfiniTensor assumes a row-major tensor layout. `transA`=false means
+        // default dims, true means A should be transposed before matmul. This is
+        // the opposite of the column-major BLAS convention.
+        bool transA, transB;
+
+        // Auxiliary attributes which are not a part of operator attributes.
+        int m, n, k;
+
+    public:
+        /**
+         * @brief Matmul operator with batch broadcast and tensor transpose
+         * support. Only one tensor with a single batch can be broadcast due to the
+         * BLAS interface restriction. Transpose indicates whether the last two
+         * dimensions should be transposed before Matmul and does not affect other
+         * leading dimensions.
+         *
+         * Matmul shows how operators are defined in InfiniTensor. The constructor
+         * of an operator may or may not create the operator's output tensors,
+         * depending on `graph`.
+         *
+         * @param graph The computation graph that this operator belongs to.
+         * @param A The input tensor.
+         * @param B The input tensor.
+         * @param C C is the output of Matmul. If outputs are going to be created in
+         * the constructor, C should be an empty Ref.
+         * @param transA If matrix A should be transposed when computing.
+         * @param transB If matrix B should be transposed when computing.
+         */
+        MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C,
+                  bool transA = false, bool transB = false);
+        OP_CLONE(MatmulObj);
+
+        std::string toString() const override;
+        optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+        int numInputs() const override { return inputs.size(); }
+        int numOutputs() const override { return 1; }
+
+        bool getTransA() const { return transA; }
+        bool getTransB() const { return transB; }
+        void setTransA(bool transA) { this->transA = transA; }
+        void setTransB(bool transB) { this->transB = transB; }
+        int getM() const { return m; }
+        int getN() const { return n; }
+        int getK() const { return k; }
+    };
+
 } // namespace infini
\ No newline at end of file
diff --git a/include/operators/transpose.h b/include/operators/transpose.h
index c32bbe5..d5dbb71 100644
--- a/include/operators/transpose.h
+++ b/include/operators/transpose.h
@@ -1,34 +1,34 @@
-#pragma once
-#include "core/operator.h"
-
-namespace infini
-{
-  /**
-   * @brief Transpose the input tensor similar to numpy.transpose.
-   *
-   */
-  class TransposeObj : public OperatorObj
-  {
-  public:
-    /**
-     * @brief Construct a new TransposeObj object.
-     *
-     * @param graph The graph to which this operator belongs.
-     * @param input The input tensor.
-     * @param output The output tensor.
-     * @param permute The permutation of the dimensions.
-     */
-    TransposeObj(GraphObj *graph, Tensor input, Tensor output,
-                 vector<int> permute);
-    OP_CLONE(TransposeObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-    std::string toString() const override;
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-    std::vector<int> getPermute() const { return transposePermute; }
-
-  private:
-    vector<int> transposePermute;
-  };
-} // namespace infini
+#pragma once
+#include "core/operator.h"
+
+namespace infini
+{
+  /**
+   * @brief Transpose the input tensor similar to numpy.transpose.
+   *
+   */
+  class TransposeObj : public OperatorObj
+  {
+  public:
+    /**
+     * @brief Construct a new TransposeObj object.
+     *
+     * @param graph The graph to which this operator belongs.
+     * @param input The input tensor.
+     * @param output The output tensor.
+     * @param permute The permutation of the dimensions.
+     */
+    TransposeObj(GraphObj *graph, Tensor input, Tensor output,
+                 vector<int> permute);
+    OP_CLONE(TransposeObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+    std::vector<int> getPermute() const { return transposePermute; }
+
+  private:
+    vector<int> transposePermute;
+  };
+} // namespace infini
diff --git a/include/operators/unary.h b/include/operators/unary.h
index 83f3dd3..16e5755 100644
--- a/include/operators/unary.h
+++ b/include/operators/unary.h
@@ -1,104 +1,104 @@
-#pragma once
-#include "core/operator.h"
-
-namespace infini
-{
-  /**
-   * @brief The base class for unary operators.
-   *
-   */
-  class UnaryObj : public OperatorObj
-  {
-  public:
-    /**
-     * @brief Construct a new Unary object.
-     *
-     * @param type Operator type.
-     * @param graph The computation graph that this operator belongs to.
-     * @param input The input tensor.
-     * @param output The output tensor.
-     */
-    UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-    std::string toString() const override;
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-  };
-
-  class ClipObj : public OperatorObj
-  {
-  public:
-    ClipObj(GraphObj *graph, Tensor input, Tensor output,
-            std::optional<float> min, std::optional<float> max);
-    OP_CLONE(ClipObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-
-    std::string toString() const override;
-    std::optional<float> getMin() const { return minValue; };
-    std::optional<float> getMax() const { return maxValue; };
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-
-  private:
-    std::optional<float> minValue, maxValue;
-  };
-
-  enum class CastType
-  {
-    Float2Float16 = 0,
-    Float2Int64,
-    Float2Int32,
-    Float2Int16,
-    Float2Int8,
-    Float2BFloat16,
-    Int322Float,
-    Int322Int8,
-    Int322Int16,
-    Int322Int64,
-    Int162Float,
-    Int162Int32,
-    Int82Float,
-    Int82Int16,
-    Int82Int32,
-    Uint82Float,
-    Uint82Int32,
-    Uint82Int64,
-    Int642Int32,
-    Int642Uint32,
-    Int642Float,
-    Uint322Int64,
-    Float162Float,
-    BFloat162Float,
-    Float2Float,
-  };
-
-  class CastObj : public OperatorObj
-  {
-  public:
-    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
-    OP_CLONE(CastObj);
-    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
-    vector<DataType> inferDataType(const TensorVec &inputs) const override;
-
-    std::string toString() const override;
-    CastType getType() const { return castType; }
-    DataType getOutputDataType() const;
-    int numInputs() const override { return 1; }
-    int numOutputs() const override { return 1; }
-
-  private:
-    CastType castType;
-  };
-
-#define DEFINE_UNARY_OBJ(prefix, type)                        \
-  class prefix##Obj : public UnaryObj                         \
-  {                                                           \
-  public:                                                     \
-    prefix##Obj(GraphObj *graph, Tensor input, Tensor output) \
-        : UnaryObj(type, graph, input, output) {}             \
-    OP_CLONE(prefix##Obj);                                    \
-  };
-
-  DEFINE_UNARY_OBJ(Relu, OpType::Relu)
-}; // namespace infini
+#pragma once
+#include "core/operator.h"
+
+namespace infini
+{
+  /**
+   * @brief The base class for unary operators.
+   *
+   */
+  class UnaryObj : public OperatorObj
+  {
+  public:
+    /**
+     * @brief Construct a new Unary object.
+     *
+     * @param type Operator type.
+     * @param graph The computation graph that this operator belongs to.
+     * @param input The input tensor.
+     * @param output The output tensor.
+     */
+    UnaryObj(OpType type, GraphObj *graph, Tensor input, Tensor output);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+  };
+
+  class ClipObj : public OperatorObj
+  {
+  public:
+    ClipObj(GraphObj *graph, Tensor input, Tensor output,
+            std::optional<float> min, std::optional<float> max);
+    OP_CLONE(ClipObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+
+    std::string toString() const override;
+    std::optional<float> getMin() const { return minValue; };
+    std::optional<float> getMax() const { return maxValue; };
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    std::optional<float> minValue, maxValue;
+  };
+
+  enum class CastType
+  {
+    Float2Float16 = 0,
+    Float2Int64,
+    Float2Int32,
+    Float2Int16,
+    Float2Int8,
+    Float2BFloat16,
+    Int322Float,
+    Int322Int8,
+    Int322Int16,
+    Int322Int64,
+    Int162Float,
+    Int162Int32,
+    Int82Float,
+    Int82Int16,
+    Int82Int32,
+    Uint82Float,
+    Uint82Int32,
+    Uint82Int64,
+    Int642Int32,
+    Int642Uint32,
+    Int642Float,
+    Uint322Int64,
+    Float162Float,
+    BFloat162Float,
+    Float2Float,
+  };
+
+  class CastObj : public OperatorObj
+  {
+  public:
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    OP_CLONE(CastObj);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) override;
+    vector<DataType> inferDataType(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    DataType getOutputDataType() const;
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+  };
+
+#define DEFINE_UNARY_OBJ(prefix, type)                        \
+  class prefix##Obj : public UnaryObj                         \
+  {                                                           \
+  public:                                                     \
+    prefix##Obj(GraphObj *graph, Tensor input, Tensor output) \
+        : UnaryObj(type, graph, input, output) {}             \
+    OP_CLONE(prefix##Obj);                                    \
+  };
+
+  DEFINE_UNARY_OBJ(Relu, OpType::Relu)
+}; // namespace infini
diff --git a/include/test.h b/include/test.h
index cef5a3e..a7d6341 100644
--- a/include/test.h
+++ b/include/test.h
@@ -1,4 +1,4 @@
-#pragma once
-#include "core/common.h"
-#include "utils/data_generator.h"
-#include "gtest/gtest.h"
+#pragma once
+#include "core/common.h"
+#include "utils/data_generator.h"
+#include "gtest/gtest.h"
diff --git a/include/utils/data_generator.h b/include/utils/data_generator.h
index 1b7d91a..4c23575 100644
--- a/include/utils/data_generator.h
+++ b/include/utils/data_generator.h
@@ -1,59 +1,59 @@
-#pragma once
-#include "core/common.h"
-#include <random>
-
-namespace infini {
-
-class DataGenerator {
-  private:
-    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }
-    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }
-
-public:
-    virtual ~DataGenerator() {}
-    void operator()(void *data, size_t size, DataType dataType) {
-        if (dataType == DataType::UInt32)
-            fill(reinterpret_cast<uint32_t *>(data), size);
-        else if (dataType == DataType::Float32)
-            fill(reinterpret_cast<float *>(data), size);
-        else
-            IT_TODO_HALT();
-    }
-};
-
-class IncrementalGenerator : public DataGenerator {
-  public:
-    virtual ~IncrementalGenerator() {}
-
-  private:
-    template <typename T> void fill(T *data, size_t size) {
-        for (size_t i = 0; i < size; i++) {
-            data[i] = i;
-        }
-    }
-
-    void fill(uint32_t *data, size_t size) override {
-        fill<uint32_t>(data, size);
-    }
-    void fill(float *data, size_t size) override { fill<float>(data, size); }
-};
-
-template <int val> class ValGenerator : public DataGenerator {
-  public:
-    virtual ~ValGenerator() {}
-
-  private:
-    template <typename T> void fill(T *data, size_t size) {
-        for (size_t i = 0; i < size; i++) {
-            data[i] = val;
-        }
-    }
-
-    void fill(uint32_t *data, size_t size) override {
-        fill<uint32_t>(data, size);
-    }
-    void fill(float *data, size_t size) override { fill<float>(data, size); }
-};
-typedef ValGenerator<1> OneGenerator;
-typedef ValGenerator<0> ZeroGenerator;
-} // namespace infini
+#pragma once
+#include "core/common.h"
+#include <random>
+
+namespace infini {
+
+class DataGenerator {
+  private:
+    virtual void fill(uint32_t *data, size_t size) { IT_TODO_HALT(); }
+    virtual void fill(float *data, size_t size) { IT_TODO_HALT(); }
+
+public:
+    virtual ~DataGenerator() {}
+    void operator()(void *data, size_t size, DataType dataType) {
+        if (dataType == DataType::UInt32)
+            fill(reinterpret_cast<uint32_t *>(data), size);
+        else if (dataType == DataType::Float32)
+            fill(reinterpret_cast<float *>(data), size);
+        else
+            IT_TODO_HALT();
+    }
+};
+
+class IncrementalGenerator : public DataGenerator {
+  public:
+    virtual ~IncrementalGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = i;
+        }
+    }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
+};
+
+template <int val> class ValGenerator : public DataGenerator {
+  public:
+    virtual ~ValGenerator() {}
+
+  private:
+    template <typename T> void fill(T *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = val;
+        }
+    }
+
+    void fill(uint32_t *data, size_t size) override {
+        fill<uint32_t>(data, size);
+    }
+    void fill(float *data, size_t size) override { fill<float>(data, size); }
+};
+typedef ValGenerator<1> OneGenerator;
+typedef ValGenerator<0> ZeroGenerator;
+} // namespace infini
diff --git a/include/utils/exception.h b/include/utils/exception.h
index d7bb433..6fa62a1 100644
--- a/include/utils/exception.h
+++ b/include/utils/exception.h
@@ -1,22 +1,22 @@
-#pragma once
-#include <stdexcept>
-#include <string>
-
-namespace infini {
-
-class Exception : public std::runtime_error {
-  protected:
-    std::string info;
-
-  public:
-    Exception(const std::string &msg);
-
-    Exception &operator<<(const std::string &str) {
-        info += str;
-        return *this;
-    }
-
-    const char *what() const noexcept override { return info.c_str(); }
-};
-
-} // namespace infini
+#pragma once
+#include <stdexcept>
+#include <string>
+
+namespace infini {
+
+class Exception : public std::runtime_error {
+  protected:
+    std::string info;
+
+  public:
+    Exception(const std::string &msg);
+
+    Exception &operator<<(const std::string &str) {
+        info += str;
+        return *this;
+    }
+
+    const char *what() const noexcept override { return info.c_str(); }
+};
+
+} // namespace infini
diff --git a/include/utils/operator_utils.h b/include/utils/operator_utils.h
index e3a2373..7f6dd29 100644
--- a/include/utils/operator_utils.h
+++ b/include/utils/operator_utils.h
@@ -1,26 +1,26 @@
-#pragma once
-#ifndef OPERATOR_UTIL_H
-#define OPERATOR_UTIL_H
-
-#include "core/operator.h"
-#include "core/tensor.h"
-
-#include <numeric>
-
-namespace infini {
-
-// Launch a broadcast shape based on the shape of input A and B
-Shape infer_broadcast(const Shape &A, const Shape &B);
-// Launch the real axis based on rank and current axis
-int get_real_axis(const int &axis, const int &rank);
-// Locate the index with size from Shape
-Shape locate_index(size_t inputN, const Shape &shape);
-// Delocate the ShapeIndex from Shape with broadcast
-size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
-                      const Shape &stride);
-// Convert KernelAttrs to a string representation
-std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
-
-} // namespace infini
-
-#endif
+#pragma once
+#ifndef OPERATOR_UTIL_H
+#define OPERATOR_UTIL_H
+
+#include "core/operator.h"
+#include "core/tensor.h"
+
+#include <numeric>
+
+namespace infini {
+
+// Launch a broadcast shape based on the shape of input A and B
+Shape infer_broadcast(const Shape &A, const Shape &B);
+// Launch the real axis based on rank and current axis
+int get_real_axis(const int &axis, const int &rank);
+// Locate the index with size from Shape
+Shape locate_index(size_t inputN, const Shape &shape);
+// Delocate the ShapeIndex from Shape with broadcast
+size_t delocate_index(const Shape &shapeIndex, const Shape &shape,
+                      const Shape &stride);
+// Convert KernelAttrs to a string representation
+std::string get_kernel_attrs_str(const KernelAttrs &kernelAttrs);
+
+} // namespace infini
+
+#endif
diff --git a/src/core/allocator.cc b/src/core/allocator.cc
index ff593ae..263a3c0 100644
--- a/src/core/allocator.cc
+++ b/src/core/allocator.cc
@@ -25,15 +25,39 @@ namespace infini
 
     size_t Allocator::alloc(size_t size)
     {
+        IT_ASSERT(size > 0);
         IT_ASSERT(this->ptr == nullptr);
         // pad the size to the multiple of alignment
         size = this->getAlignedSize(size);
-
+        size_t ret_offset = 0;
         // =================================== 作业 ===================================
         // TODO: 设计一个算法来分配内存，返回起始地址偏移量
         // =================================== 作业 ===================================
-
-        return 0;
+        // Best fit: pick the smallest free block that can hold `size`.
+        auto best = free_blocks.end();
+        for (auto it = free_blocks.begin(); it != free_blocks.end(); ++it)
+            if (it->second >= size &&
+                (best == free_blocks.end() || it->second < best->second))
+                best = it;
+        if (best != free_blocks.end()) {
+            // Carve from the block's tail so its key (start offset) stays valid.
+            best->second -= size;
+            ret_offset = best->first + best->second;
+            if (best->second == 0)
+                free_blocks.erase(best);
+        } else {
+            // Nothing fits: grow from `peak`, the arena top (free() lowers `used`,
+            // so `used` may point into live data). Absorb a free block at the top.
+            ret_offset = peak; // assumes peak starts at 0 and only grows here
+            auto last = free_blocks.rbegin(); // assumes an address-ordered map
+            if (last != free_blocks.rend() && last->first + last->second == peak) {
+                ret_offset = last->first;
+                free_blocks.erase(ret_offset);
+            }
+            peak = ret_offset + size;
+        }
+        used += size; // live bytes grow on both paths, mirroring free()
+        return ret_offset;
     }
 
     void Allocator::free(size_t addr, size_t size)
@@ -44,6 +68,36 @@ namespace infini
         // =================================== 作业 ===================================
         // TODO: 设计一个算法来回收内存
         // =================================== 作业 ===================================
+        // Return [addr, addr + size) to the free list and merge it with the
+        // free blocks that touch it on either side.
+        // NOTE(review): assumes free_blocks is an ordered std::map from block
+        // start offset to block size in which free blocks never touch each
+        // other - confirm against allocator.h.
+        used -= size;
+        free_blocks[addr] = size;
+        auto cur = free_blocks.find(addr);
+
+        // Absorb the following block when it starts where this one ends.
+        auto next = cur;
+        ++next;
+        if (next != free_blocks.end() &&
+            cur->first + cur->second == next->first)
+        {
+            cur->second += next->second;
+            free_blocks.erase(next);
+        }
+
+        // Fold this block into the preceding one when they touch.
+        if (cur != free_blocks.begin())
+        {
+            auto prev = cur;
+            --prev;
+            if (prev->first + prev->second == cur->first)
+            {
+                prev->second += cur->second;
+                free_blocks.erase(cur);
+            }
+        }
     }
 
     void *Allocator::getPtr()
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 3a90637..28fcaa5 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -2,6 +2,10 @@
 #include <algorithm>
 #include <numeric>
 #include <queue>
+#include "operators/matmul.h"
+#include "operators/transpose.h"
+
+#include "core/ref.h"
 
 namespace infini
 {
@@ -106,6 +110,140 @@ namespace infini
         // 1. 去除冗余的算子（例如，两个相邻的算子都是 transpose 算子，且做的是相反的操作，可以将其全部删除）
         // 2. 合并算子（例如，矩阵乘算子中含有属性transA、transB，如果其输入存在transpose，且对最后两个维度做交换，就可以将transpose融入到矩阵乘算子的属性中去）
         // =================================== 作业 ===================================
+
+        if (ops.empty())
+            return;
+        topo_sort();
+
+        // Operators scheduled for deletion. They are only detached while
+        // walking the graph and erased afterwards, so `ops` is never modified
+        // during iteration.
+        OpVec removed_ops;
+        auto is_removed = [&removed_ops](const auto &candidate) {
+            return std::find(removed_ops.begin(), removed_ops.end(),
+                             candidate) != removed_ops.end();
+        };
+
+        // True when applying `first` and then `second` restores the original
+        // axis order, i.e. the two transposes cancel each other.
+        auto cancels_out = [](const auto &first, const auto &second) {
+            if (first.size() != second.size())
+                return false;
+            for (size_t axis = 0; axis < second.size(); ++axis)
+            {
+                const auto mapped = static_cast<size_t>(second[axis]);
+                if (mapped >= first.size() ||
+                    static_cast<size_t>(first[mapped]) != axis)
+                    return false;
+            }
+            return true;
+        };
+
+        // True when `perm` leaves every axis in place except the last two,
+        // which it swaps - the case MatMul's transA/transB can express.
+        auto swaps_last_two = [](const auto &perm) {
+            const size_t rank = perm.size();
+            if (rank < 2)
+                return false;
+            for (size_t axis = 0; axis + 2 < rank; ++axis)
+            {
+                if (static_cast<size_t>(perm[axis]) != axis)
+                    return false;
+            }
+            return static_cast<size_t>(perm[rank - 2]) == rank - 1 &&
+                   static_cast<size_t>(perm[rank - 1]) == rank - 2;
+        };
+
+        // Fold a last-two-axes transpose feeding input `index` of `matmul`
+        // into the matching transA (index 0) or transB (index 1) attribute.
+        auto fold_transpose = [&](const auto &matmul, size_t index) {
+            auto operand = matmul->getInputs()[index];
+            auto producer = operand->getSource();
+            if (!producer || !(producer->getOpType() == OpType::Transpose))
+                return;
+            if (!swaps_last_two(as<TransposeObj>(producer)->getPermute()))
+                return;
+            // Other consumers still need the transposed tensor.
+            if (operand->getTargets().size() != 1)
+                return;
+
+            // Toggle instead of forcing `true`: the attribute may already
+            // be set, in which case the two transposes cancel.
+            if (index == 0)
+                matmul->setTransA(!matmul->getTransA());
+            else
+                matmul->setTransB(!matmul->getTransB());
+
+            auto source = producer->getInputs()[0];
+            source->removeTarget(producer);
+            source->addTarget(matmul);
+            matmul->replaceInput(operand, source);
+            matmul->removePredecessors(producer);
+            removed_ops.push_back(producer);
+            removeTensor(operand);
+        };
+
+        for (auto op : ops)
+        {
+            // Skip operators already scheduled for removal earlier in this
+            // pass so they are never matched a second time.
+            if (is_removed(op))
+                continue;
+
+            switch (op->getOpType().type)
+            {
+            case OpType::Transpose:
+            {
+                // Remove a pair of back-to-back transposes that cancel out.
+                auto input = op->getInputs()[0];
+                auto output = op->getOutput();
+                if (output->getTargets().size() != 1)
+                    break;
+                auto next_op = output->getTargets()[0];
+                if (!(next_op->getOpType() == OpType::Transpose))
+                    break;
+                if (!cancels_out(as<TransposeObj>(op)->getPermute(),
+                                 as<TransposeObj>(next_op)->getPermute()))
+                    break;
+
+                auto next_output = next_op->getOutput();
+                removed_ops.push_back(op);
+                removed_ops.push_back(next_op);
+                // Rewire every consumer of the second transpose to read the
+                // original input directly.
+                for (auto target : next_output->getTargets())
+                {
+                    input->addTarget(target);
+                    target->replaceInput(next_output, input);
+                    target->removePredecessors(next_op);
+                }
+                input->removeTarget(op);
+                removeTensor(output);
+                removeTensor(next_output);
+                break;
+            }
+            case OpType::MatMul:
+            {
+                auto matmul = as<MatmulObj>(op);
+                // NOTE(review): one tensor feeding both operands is left
+                // untouched; folding it would need both flags updated
+                // consistently.
+                if (op->getInputs()[0] == op->getInputs()[1])
+                    break;
+                fold_transpose(matmul, 0);
+                fold_transpose(matmul, 1);
+                break;
+            }
+            default:
+                // Other operator types are left untouched.
+                break;
+            }
+        }
+
+        for (auto op : removed_ops)
+        {
+            removeOperator(op);
+        }
     }
 
     Tensor GraphObj::getTensor(int fuid) const
@@ -152,7 +247,22 @@ namespace infini
         // TODO：利用 allocator 给计算图分配内存
         // HINT: 获取分配好的内存指针后，可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存
         // =================================== 作业 ===================================
-
+        // Pass 1: ask the allocator for an offset for every tensor, keyed by
+        // the tensor's fuid.
+        std::unordered_map<UidBaseType, size_t> offset_of;
+        for (auto &t : tensors)
+        {
+            offset_of[t->getFuid()] = allocator.alloc(t->getBytes());
+        }
+
+        // Pass 2: fetch the allocator's pointer once and bind each tensor to
+        // that pointer plus its offset.
+        char *const arena = static_cast<char *>(allocator.getPtr());
+        for (auto &t : tensors)
+        {
+            t->setDataBlob(
+                make_ref<BlobObj>(runtime, arena + offset_of[t->getFuid()]));
+        }
         allocator.info();
     }
 
diff --git a/src/operators/concat.cc b/src/operators/concat.cc
index d196330..b3bd424 100644
--- a/src/operators/concat.cc
+++ b/src/operators/concat.cc
@@ -12,12 +12,27 @@ ConcatObj::ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int _dim)
 optional<vector<Shape>> ConcatObj::inferShape(const TensorVec &inputs) {
     Shape dims = inputs[0]->getDims();
     auto rank = inputs[0]->getRank();
 
     // =================================== 作业 ===================================
     // TODO：修改 dims，返回正确的 concat 后的 shape
     // REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13
     // =================================== 作业 ===================================
 
+    // Start from the first input's shape and grow it along `dim`. A single
+    // input is valid for ONNX Concat and simply yields its own shape.
+    for (size_t i = 1; i < inputs.size(); i++) {
+        const auto &other = inputs[i]->getDims();
+        // Every input must have the same rank and agree on all axes except
+        // the concatenation axis.
+        IT_ASSERT(inputs[i]->getRank() == rank);
+        for (size_t axis = 0; axis < other.size(); axis++) {
+            if (static_cast<int>(axis) == dim) {
+                dims[axis] += other[axis];
+            } else {
+                IT_ASSERT(dims[axis] == other[axis]);
+            }
+        }
+    }
     return {{dims}};
 }
 
diff --git a/src/operators/matmul.cc b/src/operators/matmul.cc
index 7a16ca2..4a28378 100644
--- a/src/operators/matmul.cc
+++ b/src/operators/matmul.cc
@@ -27,7 +27,39 @@ namespace infini
         // TODO：返回经过 matmul 操作后的 shape
         // REF: https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm
         // =================================== 作业 ===================================
-        return std::nullopt;
+        Shape shape_a = inputs[0]->getDims();
+        Shape shape_b = inputs[1]->getDims();
+        const size_t rank_a = shape_a.size();
+        const size_t rank_b = shape_b.size();
+        // Both operands need at least the two matrix axes.
+        if (rank_a < 2 || rank_b < 2)
+            return std::nullopt;
+        // Apply the transpose attributes to each operand's own last two axes.
+        if (transA)
+            std::swap(shape_a[rank_a - 2], shape_a[rank_a - 1]);
+        if (transB)
+            std::swap(shape_b[rank_b - 2], shape_b[rank_b - 1]);
+        // [.., M, K] x [.., K, N]: the contracted axis must agree.
+        if (shape_a[rank_a - 1] != shape_b[rank_b - 2])
+            return std::nullopt;
+
+        // Broadcast the leading batch axes, aligned from the right, then
+        // append M and N.
+        const size_t batch_a = rank_a - 2;
+        const size_t batch_b = rank_b - 2;
+        const size_t batch = batch_a > batch_b ? batch_a : batch_b;
+        Shape ret(batch + 2);
+        for (size_t i = 0; i < batch; ++i)
+        {
+            const auto dim_a = i < batch_a ? shape_a[batch_a - 1 - i] : 1;
+            const auto dim_b = i < batch_b ? shape_b[batch_b - 1 - i] : 1;
+            if (dim_a != dim_b && dim_a != 1 && dim_b != 1)
+                return std::nullopt;
+            ret[batch - 1 - i] = dim_a == 1 ? dim_b : dim_a;
+        }
+        ret[batch] = shape_a[rank_a - 2];
+        ret[batch + 1] = shape_b[rank_b - 1];
+        return {{ret}};
     }
 
 } // namespace infini
\ No newline at end of file
diff --git a/src/operators/transpose.cc b/src/operators/transpose.cc
index faab2b6..8f5a029 100644
--- a/src/operators/transpose.cc
+++ b/src/operators/transpose.cc
@@ -33,8 +33,14 @@ namespace infini
         // TODO：修改 output_dim，返回正确的 transpose 后的 shape
         // REF: https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21
         // =================================== 作业 ===================================
-
-        return std::nullopt;
+        // Output axis `axis` takes its extent from the input axis named by
+        // the permutation.
+        for (int axis = 0; axis < rank; ++axis)
+        {
+            const auto source_axis = transposePermute[axis];
+            output_dim[axis] = input_dim[source_axis];
+        }
+        return {{output_dim}};
     }
 
     std::string TransposeObj::toString() const
diff --git a/src/operators/unary.cc b/src/operators/unary.cc
index 3daad36..56e4222 100644
--- a/src/operators/unary.cc
+++ b/src/operators/unary.cc
@@ -39,7 +39,11 @@ namespace infini
         // TODO：返回经过 clip 操作后的 shape
         // REF: https://onnx.ai/onnx/operators/onnx__Clip.html#clip-13
         // =================================== 作业 ===================================
-        return std::nullopt;
+        // Between one and three inputs are accepted; the output takes the
+        // shape of the first one.
+        const auto input_count = inputs.size();
+        IT_ASSERT(input_count > 0 && input_count < 4);
+        return {{inputs[0]->getDims()}};
     }
 
     std::string ClipObj::toString() const
@@ -66,7 +68,9 @@ namespace infini
         // REF_FILE: src/core/operator.cc
         // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21
         // =================================== 作业 ===================================
-        return {};
+        // Every output carries the type reported by getOutputDataType().
+        const auto out_dtype = getOutputDataType();
+        return vector(numOutputs(), out_dtype);
     }
 
     optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs)
@@ -75,7 +77,11 @@ namespace infini
         // TODO：返回经过 cast 操作后的 shape
         // REF: https://onnx.ai/onnx/operators/onnx__Cast.html#cast-21
         // =================================== 作业 ===================================
-        return std::nullopt;
+        // The output shape is taken unchanged from the first input; an empty
+        // input list is rejected up front.
+        IT_ASSERT(!inputs.empty(), "Empty input");
+        const auto &source = inputs[0];
+        return {{source->getDims()}};
     }
 
     std::string CastObj::toString() const
diff --git a/src/utils/operator_utils.cc b/src/utils/operator_utils.cc
index edbd2c8..b8c9d00 100644
--- a/src/utils/operator_utils.cc
+++ b/src/utils/operator_utils.cc
@@ -9,8 +9,22 @@ Shape infer_broadcast(const Shape &A, const Shape &B) {
     // TODO：对 A 和 B 进行双向广播，返回广播后的形状。
     // REF: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
     // =================================== 作业 ===================================
-    
-    return {};
+    // Align both shapes at their trailing axis; a missing leading axis
+    // behaves like extent 1.
+    const size_t rank_a = A.size();
+    const size_t rank_b = B.size();
+    const size_t rank = rank_a > rank_b ? rank_a : rank_b;
+    Shape ans(rank);
+    for (size_t i = 0; i < rank; ++i) {
+        const auto dim_a = i < rank_a ? A[rank_a - 1 - i] : 1;
+        const auto dim_b = i < rank_b ? B[rank_b - 1 - i] : 1;
+        // Two extents are compatible when they match or one of them is 1.
+        IT_ASSERT(dim_a == dim_b || dim_a == 1 || dim_b == 1);
+        // Take the non-1 side; unlike max() this also yields 0 when an
+        // extent of 1 meets an empty axis.
+        ans[rank - 1 - i] = dim_a == 1 ? dim_b : dim_a;
+    }
+    return ans;
 }
 
 int get_real_axis(const int &axis, const int &rank) {
diff --git a/test/core/test_allocator.cc b/test/core/test_allocator.cc
index 0515edc..71b1388 100644
--- a/test/core/test_allocator.cc
+++ b/test/core/test_allocator.cc
@@ -1,74 +1,74 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/unary.h"
-
-#include "test.h"
-
-namespace infini
-{
-    TEST(Allocator, testAlloc)
-    {
-        Shape shape = Shape{1, 2, 2, 3};
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Allocator allocator = Allocator(runtime);
-        // allocate a->b->c
-        size_t offsetA = allocator.alloc(a->getBytes());
-        size_t offsetB = allocator.alloc(b->getBytes());
-        size_t offsetC = allocator.alloc(c->getBytes());
-        // free b, then allocate d
-        allocator.free(offsetB, b->getBytes());
-        size_t offsetD = allocator.alloc(d->getBytes());
-        // expected to be a->d->c
-        EXPECT_EQ(offsetB, offsetD);
-        ASSERT_FALSE(offsetA == 0 && offsetB == 0 && offsetC == 0 && offsetD == 0);
-    }
-
-    TEST(Allocator, testAllocWithEndFreeBlock)
-    {
-        Shape shape = Shape{1, 2, 2, 3};
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor d =
-            make_ref<TensorObj>(Shape{2, 2, 2, 3}, DataType::Float32, runtime);
-        Allocator allocator = Allocator(runtime);
-        // allocate a->b->c
-        allocator.alloc(a->getBytes());
-        allocator.alloc(b->getBytes());
-        size_t offsetC = allocator.alloc(c->getBytes());
-        allocator.info();
-        // free c, then allocate d
-        allocator.free(offsetC, c->getBytes());
-        size_t offsetD = allocator.alloc(d->getBytes());
-        allocator.info();
-        // expected to be a->b->d, with no free block between b and c
-        EXPECT_EQ(offsetC, offsetD);
-    }
-
-    TEST(Allocator, testGetPtr)
-    {
-        Shape shape = Shape{1, 2, 2, 3};
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
-        Allocator allocator = Allocator(runtime);
-        // allocate a->b->c->d
-        allocator.alloc(a->getBytes());
-        allocator.alloc(b->getBytes());
-        allocator.alloc(c->getBytes());
-        allocator.alloc(d->getBytes());
-        // multiple calls to the getPtr() function should return the same pointer
-        void *ptr1 = allocator.getPtr();
-        void *ptr2 = allocator.getPtr();
-        EXPECT_EQ(ptr1, ptr2);
-    }
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini
+{
+    TEST(Allocator, testAlloc)
+    {
+        Shape shape = Shape{1, 2, 2, 3};
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Allocator allocator = Allocator(runtime);
+        // allocate a->b->c
+        size_t offsetA = allocator.alloc(a->getBytes());
+        size_t offsetB = allocator.alloc(b->getBytes());
+        size_t offsetC = allocator.alloc(c->getBytes());
+        // free b, then allocate d
+        allocator.free(offsetB, b->getBytes());
+        size_t offsetD = allocator.alloc(d->getBytes());
+        // expected to be a->d->c
+        EXPECT_EQ(offsetB, offsetD);
+        ASSERT_FALSE(offsetA == 0 && offsetB == 0 && offsetC == 0 && offsetD == 0);
+    }
+
+    TEST(Allocator, testAllocWithEndFreeBlock)
+    {
+        Shape shape = Shape{1, 2, 2, 3};
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor d =
+            make_ref<TensorObj>(Shape{2, 2, 2, 3}, DataType::Float32, runtime);
+        Allocator allocator = Allocator(runtime);
+        // allocate a->b->c
+        allocator.alloc(a->getBytes());
+        allocator.alloc(b->getBytes());
+        size_t offsetC = allocator.alloc(c->getBytes());
+        allocator.info();
+        // free c, then allocate d
+        allocator.free(offsetC, c->getBytes());
+        size_t offsetD = allocator.alloc(d->getBytes());
+        allocator.info();
+        // expected to be a->b->d, with no free block between b and c
+        EXPECT_EQ(offsetC, offsetD);
+    }
+
+    TEST(Allocator, testGetPtr)
+    {
+        Shape shape = Shape{1, 2, 2, 3};
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        Tensor a = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor b = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor c = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Tensor d = make_ref<TensorObj>(shape, DataType::Float32, runtime);
+        Allocator allocator = Allocator(runtime);
+        // allocate a->b->c->d
+        allocator.alloc(a->getBytes());
+        allocator.alloc(b->getBytes());
+        allocator.alloc(c->getBytes());
+        allocator.alloc(d->getBytes());
+        // multiple calls to the getPtr() function should return the same pointer
+        void *ptr1 = allocator.getPtr();
+        void *ptr2 = allocator.getPtr();
+        EXPECT_EQ(ptr1, ptr2);
+    }
+
+} // namespace infini
diff --git a/test/core/test_graph.cc b/test/core/test_graph.cc
index bf696dd..05317ad 100644
--- a/test/core/test_graph.cc
+++ b/test/core/test_graph.cc
@@ -1,40 +1,40 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/matmul.h"
-#include "operators/transpose.h"
-
-#include "test.h"
-
-namespace infini
-{
-    TEST(Graph, Optimize)
-    {
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        Graph g = make_ref<GraphObj>(runtime);
-        Tensor i1 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-        Tensor i2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-        Tensor t1 = g->addTensor({2, 3, 5, 4}, DataType::UInt32);
-        Tensor t2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-        Tensor t3 = g->addTensor({2, 3, 5, 4}, DataType::UInt32);
-        Tensor o = g->addTensor({2, 3, 4, 4}, DataType::UInt32);
-        g->addOpWithOutputs<TransposeObj>(i1, t1, Shape{0, 1, 3, 2});
-        g->addOpWithOutputs<TransposeObj>(t1, t2, Shape{0, 1, 3, 2});
-        g->addOpWithOutputs<TransposeObj>(i2, t3, Shape{0, 1, 3, 2});
-        g->addOpWithOutputs<MatmulObj>(t2, t3, o);
-        // 优化前
-        g->print();
-        g->optimize();
-        // 优化后
-        g->print();
-        EXPECT_EQ(g->getOperators().size(), 1);
-        EXPECT_EQ(g->getTensors().size(), 3);
-        EXPECT_EQ(g->getOperators()[0]->getOpType().underlying(), 7);
-        auto op = as<MatmulObj>(g->getOperators()[0]);
-        EXPECT_EQ(op->getInputs(0)->getGuid(), 2);
-        EXPECT_EQ(op->getInputs(1)->getGuid(), 3);
-        EXPECT_EQ(op->getOutputs()[0], o);
-        EXPECT_EQ(op->getTransA(), false);
-        EXPECT_EQ(op->getTransB(), true);
-    }
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/matmul.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini
+{
+    TEST(Graph, Optimize)
+    {
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i1 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+        Tensor i2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+        Tensor t1 = g->addTensor({2, 3, 5, 4}, DataType::UInt32);
+        Tensor t2 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+        Tensor t3 = g->addTensor({2, 3, 5, 4}, DataType::UInt32);
+        Tensor o = g->addTensor({2, 3, 4, 4}, DataType::UInt32);
+        g->addOpWithOutputs<TransposeObj>(i1, t1, Shape{0, 1, 3, 2});
+        g->addOpWithOutputs<TransposeObj>(t1, t2, Shape{0, 1, 3, 2});
+        g->addOpWithOutputs<TransposeObj>(i2, t3, Shape{0, 1, 3, 2});
+        g->addOpWithOutputs<MatmulObj>(t2, t3, o);
+        // 优化前
+        g->print();
+        g->optimize();
+        // 优化后
+        g->print();
+        EXPECT_EQ(g->getOperators().size(), 1);
+        EXPECT_EQ(g->getTensors().size(), 3);
+        EXPECT_EQ(g->getOperators()[0]->getOpType().underlying(), 7);
+        auto op = as<MatmulObj>(g->getOperators()[0]);
+        EXPECT_EQ(op->getInputs(0)->getGuid(), 2);
+        EXPECT_EQ(op->getInputs(1)->getGuid(), 3);
+        EXPECT_EQ(op->getOutputs()[0], o);
+        EXPECT_EQ(op->getTransA(), false);
+        EXPECT_EQ(op->getTransB(), true);
+    }
 }
\ No newline at end of file
diff --git a/test/kernels/nativecpu/test_nativecpu_concat.cc b/test/kernels/nativecpu/test_nativecpu_concat.cc
index fc87fb1..4eac503 100644
--- a/test/kernels/nativecpu/test_nativecpu_concat.cc
+++ b/test/kernels/nativecpu/test_nativecpu_concat.cc
@@ -1,28 +1,28 @@
-#include "core/graph.h"
-#include "core/runtime.h"
-#include "operators/concat.h"
-
-#include "test.h"
-
-namespace infini {
-
-TEST(Concat, NativeCpu) {
-    Runtime runtime = NativeCpuRuntimeObj::getInstance();
-    Graph g = make_ref<GraphObj>(runtime);
-
-    auto t1 = g->addTensor({2, 2, 3, 1}, DataType::Float32);
-    auto t2 = g->addTensor({2, 2, 1, 1}, DataType::Float32);
-    auto t3 = g->addTensor({2, 2, 2, 1}, DataType::Float32);
-    auto op = g->addOp<ConcatObj>(TensorVec{t1, t2, t3}, nullptr, 2);
-    g->dataMalloc();
-    t1->setData(IncrementalGenerator());
-    t2->setData(OneGenerator());
-    t3->setData(OneGenerator());
-
-    runtime->run(g);
-    EXPECT_TRUE(op->getOutput()->equalData(
-        vector<float>{0, 1, 2, 1, 1, 1, 3, 4,  5,  1, 1, 1,
-                      6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1}));
-}
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/concat.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Concat, NativeCpu) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    auto t1 = g->addTensor({2, 2, 3, 1}, DataType::Float32);
+    auto t2 = g->addTensor({2, 2, 1, 1}, DataType::Float32);
+    auto t3 = g->addTensor({2, 2, 2, 1}, DataType::Float32);
+    auto op = g->addOp<ConcatObj>(TensorVec{t1, t2, t3}, nullptr, 2);
+    g->dataMalloc();
+    t1->setData(IncrementalGenerator());
+    t2->setData(OneGenerator());
+    t3->setData(OneGenerator());
+
+    runtime->run(g);
+    EXPECT_TRUE(op->getOutput()->equalData(
+        vector<float>{0, 1, 2, 1, 1, 1, 3, 4,  5,  1, 1, 1,
+                      6, 7, 8, 1, 1, 1, 9, 10, 11, 1, 1, 1}));
+}
+
+} // namespace infini
diff --git a/test/kernels/nativecpu/test_nativecpu_elementwise.cc b/test/kernels/nativecpu/test_nativecpu_elementwise.cc
index c6ef191..9d8ae3c 100644
--- a/test/kernels/nativecpu/test_nativecpu_elementwise.cc
+++ b/test/kernels/nativecpu/test_nativecpu_elementwise.cc
@@ -1,44 +1,44 @@
-#include "core/graph.h"
-#include "core/runtime.h"
-#include "operators/element_wise.h"
-
-#include "test.h"
-
-namespace infini {
-
-using ExpectOutput = vector<float>;
-template <class T>
-void testElementWiseNativeCpu(
-    const std::function<void(void *, size_t, DataType)> &generator1,
-    const std::function<void(void *, size_t, DataType)> &generator2,
-    const Shape &shape1, const Shape &shape2, const ExpectOutput &ansVec) {
-    Runtime runtime = NativeCpuRuntimeObj::getInstance();
-    Graph g = make_ref<GraphObj>(runtime);
-    auto t1 = g->addTensor(shape1, DataType::Float32);
-    auto t2 = g->addTensor(shape2, DataType::Float32);
-
-    auto op = g->addOp<T>(t1, t2, nullptr);
-    g->dataMalloc();
-    t1->setData(generator1);
-    t2->setData(generator2);
-
-    runtime->run(g);
-    EXPECT_TRUE(op->getOutput()->equalData(ansVec));
-}
-
-TEST(ElementWise, NativeCpu) {
-    testElementWiseNativeCpu<AddObj>(
-        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
-        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 4, 5, 6, 6, 7, 8, 10, 11, 12});
-    testElementWiseNativeCpu<MulObj>(
-        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
-        Shape{2, 1, 1}, ExpectOutput{0, 0, 0, 3, 4, 5, 0, 0, 0, 9, 10, 11});
-    testElementWiseNativeCpu<SubObj>(
-        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
-        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 2, 3, 4, 6, 7, 8, 8, 9, 10});
-    testElementWiseNativeCpu<DivObj>(
-        IncrementalGenerator(), OneGenerator(), Shape{1, 2, 2, 3, 1},
-        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-}
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+using ExpectOutput = vector<float>;
+template <class T>
+void testElementWiseNativeCpu(
+    const std::function<void(void *, size_t, DataType)> &generator1,
+    const std::function<void(void *, size_t, DataType)> &generator2,
+    const Shape &shape1, const Shape &shape2, const ExpectOutput &ansVec) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    auto t1 = g->addTensor(shape1, DataType::Float32);
+    auto t2 = g->addTensor(shape2, DataType::Float32);
+
+    auto op = g->addOp<T>(t1, t2, nullptr);
+    g->dataMalloc();
+    t1->setData(generator1);
+    t2->setData(generator2);
+
+    runtime->run(g);
+    EXPECT_TRUE(op->getOutput()->equalData(ansVec));
+}
+
+TEST(ElementWise, NativeCpu) {
+    testElementWiseNativeCpu<AddObj>(
+        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
+        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 4, 5, 6, 6, 7, 8, 10, 11, 12});
+    testElementWiseNativeCpu<MulObj>(
+        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
+        Shape{2, 1, 1}, ExpectOutput{0, 0, 0, 3, 4, 5, 0, 0, 0, 9, 10, 11});
+    testElementWiseNativeCpu<SubObj>(
+        IncrementalGenerator(), IncrementalGenerator(), Shape{1, 2, 2, 3, 1},
+        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 2, 3, 4, 6, 7, 8, 8, 9, 10});
+    testElementWiseNativeCpu<DivObj>(
+        IncrementalGenerator(), OneGenerator(), Shape{1, 2, 2, 3, 1},
+        Shape{2, 1, 1}, ExpectOutput{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+}
+
+} // namespace infini
diff --git a/test/kernels/nativecpu/test_nativecpu_transpose.cc b/test/kernels/nativecpu/test_nativecpu_transpose.cc
index 501d402..0fcf808 100644
--- a/test/kernels/nativecpu/test_nativecpu_transpose.cc
+++ b/test/kernels/nativecpu/test_nativecpu_transpose.cc
@@ -1,27 +1,27 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/transpose.h"
-
-#include "test.h"
-
-namespace infini {
-
-TEST(Transpose, NativeCpu) {
-    Runtime runtime = NativeCpuRuntimeObj::getInstance();
-    Graph g = make_ref<GraphObj>(runtime);
-
-    Shape permute = {0, 2, 1, 3};
-    auto input = g->addTensor({1, 2, 3, 4}, DataType::Float32);
-    auto op = g->addOp<TransposeObj>(input, nullptr, permute);
-    g->dataMalloc();
-    input->setData(IncrementalGenerator());
-
-    runtime->run(g);
-
-    EXPECT_TRUE(op->getOutput(0)->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
-                                                          4, 5, 6, 7, 16, 17, 18, 19,
-                                                          8, 9, 10, 11, 20, 21, 22, 23}));
-}
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Transpose, NativeCpu) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+
+    Shape permute = {0, 2, 1, 3};
+    auto input = g->addTensor({1, 2, 3, 4}, DataType::Float32);
+    auto op = g->addOp<TransposeObj>(input, nullptr, permute);
+    g->dataMalloc();
+    input->setData(IncrementalGenerator());
+
+    runtime->run(g);
+
+    EXPECT_TRUE(op->getOutput(0)->equalData(vector<float>{0, 1, 2, 3, 12, 13, 14, 15,
+                                                          4, 5, 6, 7, 16, 17, 18, 19,
+                                                          8, 9, 10, 11, 20, 21, 22, 23}));
+}
+
+} // namespace infini
diff --git a/test/operators/test_cast.cc b/test/operators/test_cast.cc
index 3177751..52909d0 100644
--- a/test/operators/test_cast.cc
+++ b/test/operators/test_cast.cc
@@ -1,23 +1,23 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/unary.h"
-
-#include "test.h"
-
-namespace infini
-{
-
-    TEST(Cast, ShapeInference)
-    {
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({2}, DataType::Float32);
-            auto op = g->addOp<CastObj>(i0, nullptr, CastType::Float2Float16);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2}));
-            EXPECT_EQ(op->getOutDType(), (DataType::Float16));
-        }
-    }
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini
+{
+
+    TEST(Cast, ShapeInference)
+    {
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({2}, DataType::Float32);
+            auto op = g->addOp<CastObj>(i0, nullptr, CastType::Float2Float16);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2}));
+            EXPECT_EQ(op->getOutDType(), (DataType::Float16));
+        }
+    }
+
+} // namespace infini
diff --git a/test/operators/test_clip.cc b/test/operators/test_clip.cc
index bd4e07f..b440163 100644
--- a/test/operators/test_clip.cc
+++ b/test/operators/test_clip.cc
@@ -1,23 +1,23 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/unary.h"
-
-#include "test.h"
-
-namespace infini {
-
-    TEST(Clip, ShapeInference)
-    {
-        // Runtime
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        Graph g = make_ref<GraphObj>(runtime);
-        Tensor i0 = g->addTensor({1, 2, 2, 3}, DataType::Float32);
-        float min = 1.0;
-        float max = 4.0;
-        auto op = g->addOp<ClipObj>(i0, nullptr, min, max);
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 2, 3}));
-        EXPECT_EQ(op->getOutDType(), (DataType::Float32));
-    }
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+    TEST(Clip, ShapeInference)
+    {
+        // Runtime
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i0 = g->addTensor({1, 2, 2, 3}, DataType::Float32);
+        float min = 1.0;
+        float max = 4.0;
+        auto op = g->addOp<ClipObj>(i0, nullptr, min, max);
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 2, 3}));
+        EXPECT_EQ(op->getOutDType(), (DataType::Float32));
+    }
+
+} // namespace infini
diff --git a/test/operators/test_concat.cc b/test/operators/test_concat.cc
index 8984b9f..6bb7ea5 100644
--- a/test/operators/test_concat.cc
+++ b/test/operators/test_concat.cc
@@ -1,16 +1,16 @@
-#include "core/graph.h"
-#include "core/runtime.h"
-#include "operators/concat.h"
-#include "test.h"
-
-namespace infini {
-TEST(Concat, ShapeInfer) {
-    Runtime runtime = NativeCpuRuntimeObj::getInstance();
-    Graph g = make_ref<GraphObj>(runtime);
-    auto t1 = g->addTensor({1, 3, 2, 4}, DataType::Float32);
-    auto t2 = g->addTensor({1, 3, 2, 5}, DataType::Float32);
-
-    auto op = g->addOp<ConcatObj>(TensorVec{t1, t2}, nullptr, 3);
-    EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 9}));
-}
-} // namespace infini
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "operators/concat.h"
+#include "test.h"
+
+namespace infini {
+TEST(Concat, ShapeInfer) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    Graph g = make_ref<GraphObj>(runtime);
+    auto t1 = g->addTensor({1, 3, 2, 4}, DataType::Float32);
+    auto t2 = g->addTensor({1, 3, 2, 5}, DataType::Float32);
+
+    auto op = g->addOp<ConcatObj>(TensorVec{t1, t2}, nullptr, 3);
+    EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 9}));
+}
+} // namespace infini
diff --git a/test/operators/test_element_wise.cc b/test/operators/test_element_wise.cc
index f4fdd66..f2c30cd 100644
--- a/test/operators/test_element_wise.cc
+++ b/test/operators/test_element_wise.cc
@@ -1,66 +1,66 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/element_wise.h"
-
-#include "test.h"
-
-namespace infini {
-
-    TEST(ElementWise, ShapeInference)
-    {
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
-            Tensor i1 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
-        }
-    }
-
-    TEST(ElementWise, Broadcasting)
-    {
-        Runtime runtime = NativeCpuRuntimeObj::getInstance();
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-            Tensor i1 = g->addTensor({}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
-        }
-
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-            Tensor i1 = g->addTensor({5}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
-        }
-
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({4, 5}, DataType::UInt32);
-            Tensor i1 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
-        }
-
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({1, 4, 5}, DataType::UInt32);
-            Tensor i1 = g->addTensor({2, 3, 1, 1}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
-        }
-
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            Tensor i0 = g->addTensor({3, 4, 5}, DataType::UInt32);
-            Tensor i1 = g->addTensor({2, 1, 1, 1}, DataType::UInt32);
-            auto op = g->addOp<AddObj>(i0, i1, nullptr);
-            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
-        }
-    }
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/element_wise.h"
+
+#include "test.h"
+
+namespace infini {
+
+    TEST(ElementWise, ShapeInference)
+    {
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
+            Tensor i1 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
+        }
+    }
+
+    TEST(ElementWise, Broadcasting)
+    {
+        Runtime runtime = NativeCpuRuntimeObj::getInstance();
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+            Tensor i1 = g->addTensor({}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
+        }
+
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+            Tensor i1 = g->addTensor({5}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
+        }
+
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({4, 5}, DataType::UInt32);
+            Tensor i1 = g->addTensor({2, 3, 4, 5}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
+        }
+
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({1, 4, 5}, DataType::UInt32);
+            Tensor i1 = g->addTensor({2, 3, 1, 1}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
+        }
+
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            Tensor i0 = g->addTensor({3, 4, 5}, DataType::UInt32);
+            Tensor i1 = g->addTensor({2, 1, 1, 1}, DataType::UInt32);
+            auto op = g->addOp<AddObj>(i0, i1, nullptr);
+            EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 4, 5}));
+        }
+    }
+
+} // namespace infini
diff --git a/test/operators/test_matmul.cc b/test/operators/test_matmul.cc
index 32fbc36..843b796 100644
--- a/test/operators/test_matmul.cc
+++ b/test/operators/test_matmul.cc
@@ -1,57 +1,57 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/matmul.h"
-
-#include "test.h"
-
-namespace infini
-{
-    using ExpectOutput = vector<float>;
-
-    TEST(Matmul, ShapeInference)
-    {
-        auto runtime = NativeCpuRuntimeObj::getInstance();
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            auto A = g->addTensor(Shape{1, 3, 5});
-            auto B = g->addTensor(Shape{1, 5, 2});
-            auto matmul = g->addOp<MatmulObj>(A, B, nullptr);
-            auto C = matmul->getOutputs()[0];
-            EXPECT_EQ(C->getDims(), (Shape{1, 3, 2}));
-        }
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            auto A = g->addTensor(Shape{3, 5, 4});
-            auto B = g->addTensor(Shape{3, 5, 2});
-            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, false);
-            auto C = matmul->getOutputs()[0];
-            EXPECT_EQ(C->getDims(), (Shape{3, 4, 2}));
-        }
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            auto A = g->addTensor(Shape{1, 2, 3, 5});
-            auto B = g->addTensor(Shape{1, 1, 5, 2});
-            auto matmul = g->addOp<MatmulObj>(A, B, nullptr);
-            auto C = matmul->getOutputs()[0];
-            EXPECT_EQ(C->getDims(), (Shape{1, 2, 3, 2}));
-        }
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            auto A = g->addTensor(Shape{2, 3, 5, 4});
-            auto B = g->addTensor(Shape{1, 3, 5, 2});
-            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, false);
-            auto C = matmul->getOutputs()[0];
-            EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2}));
-        }
-        {
-            Graph g = make_ref<GraphObj>(runtime);
-            auto A = g->addTensor(Shape{2, 3, 5, 4});
-            auto B = g->addTensor(Shape{1, 3, 2, 5});
-            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, true);
-            auto C = matmul->getOutputs()[0];
-            EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2}));
-        }
-    }
-
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/matmul.h"
+
+#include "test.h"
+
+namespace infini
+{
+    using ExpectOutput = vector<float>;
+
+    TEST(Matmul, ShapeInference)
+    {
+        auto runtime = NativeCpuRuntimeObj::getInstance();
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            auto A = g->addTensor(Shape{1, 3, 5});
+            auto B = g->addTensor(Shape{1, 5, 2});
+            auto matmul = g->addOp<MatmulObj>(A, B, nullptr);
+            auto C = matmul->getOutputs()[0];
+            EXPECT_EQ(C->getDims(), (Shape{1, 3, 2}));
+        }
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            auto A = g->addTensor(Shape{3, 5, 4});
+            auto B = g->addTensor(Shape{3, 5, 2});
+            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, false);
+            auto C = matmul->getOutputs()[0];
+            EXPECT_EQ(C->getDims(), (Shape{3, 4, 2}));
+        }
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            auto A = g->addTensor(Shape{1, 2, 3, 5});
+            auto B = g->addTensor(Shape{1, 1, 5, 2});
+            auto matmul = g->addOp<MatmulObj>(A, B, nullptr);
+            auto C = matmul->getOutputs()[0];
+            EXPECT_EQ(C->getDims(), (Shape{1, 2, 3, 2}));
+        }
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            auto A = g->addTensor(Shape{2, 3, 5, 4});
+            auto B = g->addTensor(Shape{1, 3, 5, 2});
+            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, false);
+            auto C = matmul->getOutputs()[0];
+            EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2}));
+        }
+        {
+            Graph g = make_ref<GraphObj>(runtime);
+            auto A = g->addTensor(Shape{2, 3, 5, 4});
+            auto B = g->addTensor(Shape{1, 3, 2, 5});
+            auto matmul = g->addOp<MatmulObj>(A, B, nullptr, true, true);
+            auto C = matmul->getOutputs()[0];
+            EXPECT_EQ(C->getDims(), (Shape{2, 3, 4, 2}));
+        }
+    }
+
 }; // namespace infini
\ No newline at end of file
diff --git a/test/operators/test_transpose.cc b/test/operators/test_transpose.cc
index 1c12b79..4867ea5 100644
--- a/test/operators/test_transpose.cc
+++ b/test/operators/test_transpose.cc
@@ -1,32 +1,32 @@
-#include "core/graph.h"
-#include "core/kernel.h"
-#include "core/runtime.h"
-#include "operators/transpose.h"
-
-#include "test.h"
-
-namespace infini {
-
-TEST(Transpose, ShapeInference) {
-    Runtime runtime = NativeCpuRuntimeObj::getInstance();
-    {
-        Graph g = make_ref<GraphObj>(runtime);
-        Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32);
-        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 1, 2, 3});
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 3, 4}));
-    }
-    {
-        Graph g = make_ref<GraphObj>(runtime);
-        Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32);
-        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 2, 1, 3});
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
-    }
-    {
-        Graph g = make_ref<GraphObj>(runtime);
-        Tensor i = g->addTensor({2, 3, 4}, DataType::Float32);
-        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 2, 1});
-        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 4, 3}));
-    }
-}
-
-} // namespace infini
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/transpose.h"
+
+#include "test.h"
+
+namespace infini {
+
+TEST(Transpose, ShapeInference) {
+    Runtime runtime = NativeCpuRuntimeObj::getInstance();
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32);
+        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 1, 2, 3});
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 2, 3, 4}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({1, 2, 3, 4}, DataType::Float32);
+        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 2, 1, 3});
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{1, 3, 2, 4}));
+    }
+    {
+        Graph g = make_ref<GraphObj>(runtime);
+        Tensor i = g->addTensor({2, 3, 4}, DataType::Float32);
+        auto op = g->addOp<TransposeObj>(i, nullptr, Shape{0, 2, 1});
+        EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 4, 3}));
+    }
+}
+
+} // namespace infini