diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b96c12fbf3..b8d3af7adab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,6 +119,10 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
   add_definitions(-DET_EVENT_TRACER_ENABLED)
 endif()
 
+if(EXECUTORCH_ENABLE_BUNDLE_IO)
+  add_definitions(-DET_BUNDLE_IO_ENABLED)
+endif()
+
 # -ffunction-sections -fdata-sections: breaks function and data into sections so
 # they can be properly gc'd. -s: strip symbol.
 if(WIN32)
@@ -1072,6 +1076,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
     list(APPEND _executor_runner_libs etdump flatccrt)
   endif()
 
+  if(EXECUTORCH_ENABLE_BUNDLE_IO)
+    list(APPEND _executor_runner_libs bundled_program)
+  endif()
+
   add_executable(executor_runner ${_executor_runner__srcs})
   if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
     target_link_options_gc_sections(executor_runner)
diff --git a/backends/arm/scripts/build_executor_runner_vkml.sh b/backends/arm/scripts/build_executor_runner_vkml.sh
index 61edf3fbbe4..16074bc8ead 100755
--- a/backends/arm/scripts/build_executor_runner_vkml.sh
+++ b/backends/arm/scripts/build_executor_runner_vkml.sh
@@ -6,39 +6,43 @@
 
 set -eu
 
-script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
 et_root_dir=$(cd ${script_dir}/../../.. && pwd)
 et_root_dir=$(realpath ${et_root_dir})
 setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh
 _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools."
-source "${script_dir}/utils.sh"
-
 build_type="Release"
 build_with_etdump=false
 extra_build_flags=""
 output_folder="cmake-out-vkml"
+build_with_etdump_flags="-DEXECUTORCH_ENABLE_EVENT_TRACER=OFF"
+build_with_bundleio_flags="-DEXECUTORCH_ENABLE_BUNDLE_IO=OFF"
+
+source "${script_dir}/utils.sh"
+
 
-build_with_etdump_flags="-DEXECUTORCH_ENABLE_EVENT_TRACER=OFF -DEXECUTORCH_BUILD_DEVTOOLS=OFF"
 help() {
     echo "Usage: $(basename $0) [options]"
     echo "Options:"
-    echo "  --build_type=<TYPE>             Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
-    echo "  --etdump                        Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
-    echo "  --extra_build_flags=<FLAGS>     Extra flags to pass to cmake. Default: none "
-    echo "  --output=<FOLDER>               Output folder Default: $(output_folder)"
+    echo "  --build_type=<TYPE>         Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
+    echo "  --etdump                    Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
+    echo "  --extra_build_flags=<FLAGS> Extra flags to pass to cmake. Default: none "
+    echo "  --output=<FOLDER>           Output folder Default: ${output_folder}"
+    echo "  --bundleio                  Support BundleIO using Devtools with Input/RefOutput included"
     exit 0
 }
 
 for arg in "$@"; do
     case $arg in
-      -h|--help) help ;;
-      --build_type=*) build_type="${arg#*=}";;
-      --etdump) build_with_etdump=true ;;
-      --extra_build_flags=*) extra_build_flags="${arg#*=}";;
-      --output=*) output_folder="${arg#*=}";;
-      --select_ops_list=*) select_ops_list="${arg#*=}";;
-      *)
-      ;;
+        -h|--help) help ;;
+        --build_type=*) build_type="${arg#*=}";;
+        --etdump) build_with_etdump=true ;;
+        --extra_build_flags=*) extra_build_flags="${arg#*=}";;
+        --output=*) output_folder="${arg#*=}";;
+        --select_ops_list=*) select_ops_list="${arg#*=}";;
+        --bundleio) build_with_bundleio_flags="-DEXECUTORCH_ENABLE_BUNDLE_IO=ON" ;;
+        *)
+        ;;
     esac
 done
 
@@ -52,23 +56,24 @@ source ${setup_path_script}
 mkdir -p "${output_folder}"
 output_folder=$(realpath "${output_folder}")
 
-echo "--------------------------------------------------------------------------------"
-echo "Build Arm VKML executor runner: '${output_folder}' with extra build flags: ${extra_build_flags}"
-echo "--------------------------------------------------------------------------------"
-
 cd ${et_root_dir}/examples/arm/executor_runner
 
 if [ "$build_with_etdump" = true ] ; then
-    build_with_etdump_flags="-DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DEXECUTORCH_BUILD_DEVTOOLS=ON"
+    build_with_etdump_flags="-DEXECUTORCH_ENABLE_EVENT_TRACER=ON"
 fi
 
-echo "Building with extra flags: ${build_with_etdump_flags} ${extra_build_flags}"
+echo "-----------------------------------------------------------------------------------------------"
+echo "Build Arm VKML executor runner: '${output_folder}' with extra build flags: "
+echo "${build_with_etdump_flags} ${build_with_bundleio_flags} ${extra_build_flags}"
+echo "-----------------------------------------------------------------------------------------------"
+
 cmake \
     -S "${et_root_dir}" \
     -B "${output_folder}" \
     -Wall \
     -Werror \
     -DCMAKE_BUILD_TYPE=${build_type}            \
+    -DCMAKE_CXX_FLAGS="${extra_build_flags} ${CMAKE_CXX_FLAGS:-}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
@@ -80,9 +85,10 @@ cmake \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
+    -DEXECUTORCH_BUILD_DEVTOOLS=ON \
     -DPYTHON_EXECUTABLE="$(which python3)"      \
-    ${build_with_etdump_flags}                  \
-    ${extra_build_flags}
+    ${build_with_etdump_flags} \
+    ${build_with_bundleio_flags}
 
 echo "[${BASH_SOURCE[0]}] Configured CMAKE"
 
diff --git a/backends/arm/scripts/run_vkml.sh b/backends/arm/scripts/run_vkml.sh
index 8a64a937638..c69e3e0f8ef 100755
--- a/backends/arm/scripts/run_vkml.sh
+++ b/backends/arm/scripts/run_vkml.sh
@@ -19,6 +19,7 @@ _setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly ins
 
 
 model=""
+opt_flags=""
 build_path="cmake-out-vkml"
 converter="model-converter"
 
@@ -33,6 +34,7 @@ help() {
 for arg in "$@"; do
     case $arg in
       -h|--help) help ;;
+      --optional_flags=*) opt_flags="${arg#*=}";;
       --model=*) model="${arg#*=}";;
       --build_path=*) build_path="${arg#*=}";;
       *)
@@ -55,11 +57,11 @@ hash ${converter} \
     || { echo "Could not find ${converter} on PATH, ${_setup_msg}"; exit 1; }
 
 
+runner=$(find ${build_path} -name executor_runner -type f)
 
-runner="${build_path}/executor_runner"
 
 echo "--------------------------------------------------------------------------------"
-echo "Running ${model} with ${runner}"
+echo "Running ${model} with ${runner} ${opt_flags}"
 echo "WARNING: The VK_ML layer driver will not provide accurate performance information"
 echo "--------------------------------------------------------------------------------"
 
@@ -75,7 +77,7 @@ fi
 log_file=$(mktemp)
 
 
-${nobuf} ${runner} -model_path ${model} | tee ${log_file}
+${nobuf} ${runner} -model_path ${model} ${opt_flags} | tee ${log_file}
 echo "[${BASH_SOURCE[0]}] execution complete, $?"
 
 # Most of these can happen for bare metal or linx executor_runner runs.
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index 4bc4fe0f06d..879d140e7c1 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -190,11 +190,11 @@ test_run_vkml() { # End to End model tests using run.sh
 
     echo "${TEST_SUITE_NAME}: Test VKML"
     out_folder="arm_test/test_run"
-    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=add --output=${out_folder}/runner
-    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=mul --output=${out_folder}/runner
+    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=add --output=${out_folder}/runner --bundleio
+    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=mul --output=${out_folder}/runner --bundleio
 
-    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=qadd --output=${out_folder}/runner
-    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=qops --output=${out_folder}/runner
+    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=qadd --output=${out_folder}/runner --bundleio
+    examples/arm/run.sh --et_build_root=${out_folder} --target=vgf --model_name=qops --output=${out_folder}/runner --bundleio
 
     echo "${TEST_SUITE_NAME}: PASS"
 }
@@ -254,8 +254,8 @@ test_models_vkml() { # End to End model tests using model_test.py
 
     # VKML
     echo "${TEST_SUITE_NAME}: Test target VKML"
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=vgf --model=mv2
-    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=vgf --no_quantize --model=mv2
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=vgf --model=resnet18 --extra_runtime_flags="--bundleio_atol=0.2 --bundleio_rtol=0.2"
+    python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=vgf --model=resnet50 --extra_runtime_flags="--bundleio_atol=0.2 --bundleio_rtol=0.2"
 
     echo "${TEST_SUITE_NAME}: PASS"
 }
diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py
index 5dc11e12a08..04972856044 100755
--- a/backends/arm/test/test_model.py
+++ b/backends/arm/test/test_model.py
@@ -67,9 +67,15 @@ def get_args():
     parser.add_argument(
         "--extra_flags",
         required=False,
-        default=None,
+        default="",
         help="Extra cmake flags to pass the when building the executor_runner",
     )
+    parser.add_argument(
+        "--extra_runtime_flags",
+        required=False,
+        default="",
+        help="Extra runtime flags to pass the final runner/executable",
+    )
     parser.add_argument(
         "--timeout",
         required=False,
@@ -130,20 +136,18 @@ def build_pte(
     no_intermediate: bool,
     no_quantize: bool,
 ):
-    pte_file_ending = "pte"
     command_list = [
         "python3",
         "-m",
         "examples.arm.aot_arm_compiler",
         "--delegate",
+        "--bundleio",
         f"--model_name={model_name}",
         f"--target={target}",
         f"--output={build_output}",
     ]
 
     if "vgf" != target:
-        pte_file_ending = "bpte"
-        command_list.append("--bundleio")
         command_list.append(f"--system_config={system_config}")
         command_list.append(f"--memory_mode={memory_mode}")
 
@@ -155,6 +159,7 @@ def build_pte(
 
     run_external_cmd(command_list)
 
+    pte_file_ending = "bpte"
     pte_file = os.path.join(
         output, f"{model_name}_arm_delegate_{args.target}.{pte_file_ending}"
     )
@@ -218,6 +223,7 @@ def build_vkml_runtime(
             os.path.join(script_path, "build_executor_runner_vkml.sh"),
             f"--et_build_root={et_build_root}",
             "--etdump",
+            "--bundleio",
             "--build_type=Release",
             f"--extra_build_flags=-DET_DUMP_OUTPUT=OFF {extra_flags}",
             f"--output={build_path}",
@@ -228,13 +234,14 @@ def build_vkml_runtime(
     return runner
 
 
-def run_vkml(script_path: str, pte_file: str, runner_build_path: str):
+def run_vkml(script_path: str, pte_file: str, runner_build_path: str, extra_flags: str):
     run_external_cmd(
         [
             "bash",
             os.path.join(script_path, "run_vkml.sh"),
             f"--model={pte_file}",
             f"--build_path={runner_build_path}",
+            f"--optional_flags={extra_flags}",
         ]
     )
 
@@ -297,7 +304,7 @@ def run_vkml(script_path: str, pte_file: str, runner_build_path: str):
             )
 
             start_time = time.perf_counter()
-            run_vkml(script_path, pte_file, build_path)
+            run_vkml(script_path, pte_file, build_path, args.extra_runtime_flags)
             end_time = time.perf_counter()
             print(
                 f"[Test model: {end_time - start_time:.2f} s] Tested VKML runner: {vkml_runner}"
diff --git a/examples/arm/executor_runner/pte_to_header.py b/examples/arm/executor_runner/pte_to_header.py
index 1b5fad05a12..65213bc729e 100644
--- a/examples/arm/executor_runner/pte_to_header.py
+++ b/examples/arm/executor_runner/pte_to_header.py
@@ -59,7 +59,7 @@ def input_file_path(path):
 if __name__ == "__main__":
     args = parser.parse_args()
     outfile = os.path.join(args.outdir, args.outfile)
-    attr = f'__attribute__((section("{args.section}"), aligned(16))) char '
+    attr = f'__attribute__((section("{args.section}"), aligned(16))) unsigned char '
 
     with open(args.pte, "rb") as fr, open(outfile, "w") as fw:
         data = fr.read()
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 191be3fc3fe..6f7acaf9f58 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -321,7 +321,8 @@ for i in "${!test_model[@]}"; do
         set -x
         backends/arm/scripts/build_executor_runner_vkml.sh --build_type=${build_type} \
                                                            --extra_build_flags="${extra_build_flags}" \
-                                                           --output="${output_folder}"
+                                                           --output="${output_folder}" \
+                                                           ${bundleio_flag}
         if [ "$build_only" = false ] ; then
             backends/arm/scripts/run_vkml.sh --model=${pte_file} --build_path=${output_folder}
         fi
diff --git a/examples/devtools/CMakeLists.txt b/examples/devtools/CMakeLists.txt
index 355ff375361..f541f70f86d 100644
--- a/examples/devtools/CMakeLists.txt
+++ b/examples/devtools/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -47,7 +48,9 @@ find_package(
 )
 
 add_executable(example_runner example_runner/example_runner.cpp)
-target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
+target_compile_options(
+  executorch INTERFACE -DET_EVENT_TRACER_ENABLED -DET_BUNDLE_IO_ENABLED
+)
 
 target_include_directories(
   etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../devtools/include
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 1157554c050..3b287d49bbe 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -18,12 +18,16 @@
  * all fp32 tensors.
  */
 
+#include <cstdint>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <optional>
 
 #include <gflags/gflags.h>
 
+#include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
@@ -42,8 +46,10 @@
 #include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
 #include <executorch/extension/threadpool/threadpool_guard.h>
+#endif
 
-#include <optional>
+#ifdef ET_BUNDLE_IO_ENABLED
+#include <executorch/devtools/bundled_program/bundled_program.h>
 #endif
 
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
@@ -54,7 +60,7 @@ DEFINE_string(
     model_path,
     "model.pte",
     "Model serialized in flatbuffer format.");
-DEFINE_string(data_path, "", "Path to data file.");
+DEFINE_string(data_path, "", "Path to data file (.ptd).");
 DEFINE_string(inputs, "", "Comma-separated list of input files");
 DEFINE_string(
     output_file,
@@ -74,10 +80,22 @@ DEFINE_int32(
     -1,
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
+#ifdef ET_BUNDLE_IO_ENABLED
+DEFINE_double(bundleio_rtol, 0.01, "Relative tolerance for bundled IO.");
+DEFINE_double(bundleio_atol, 0.01, "Absolute tolerance for bundled IO.");
+#endif
+
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
+#ifdef ET_BUNDLE_IO_ENABLED
+using executorch::bundled_program::compute_method_output_error_stats;
+using executorch::bundled_program::ErrorStats;
+using executorch::bundled_program::verify_method_outputs;
+#endif
+using executorch::extension::BufferDataLoader;
 using executorch::extension::FileDataLoader;
 using executorch::extension::FlatTensorDataMap;
+using executorch::runtime::DataLoader;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::EventTracer;
@@ -142,6 +160,25 @@ class EventTraceManager {
   std::shared_ptr<EventTracer> event_tracer_ptr_;
 };
 
+#ifdef ET_BUNDLE_IO_ENABLED
+// Reads the whole file at `path` into memory; failures go through ET_CHECK_MSG.
+std::vector<uint8_t> try_load_file(const std::filesystem::path& path) {
+  std::ifstream file(path, std::ios::binary | std::ios::ate);
+  ET_CHECK_MSG(
+      file.is_open(), "Could not open file '%s'", path.string().c_str());
+  const std::streamoff nbytes = file.tellg(); // ios::ate: position == size.
+  ET_CHECK_MSG(
+      nbytes >= 0, "Could not size file '%s'", path.string().c_str());
+  file.seekg(0, std::ios::beg);
+  std::vector<uint8_t> file_data(static_cast<std::size_t>(nbytes));
+  ET_CHECK_MSG(
+      file.read(reinterpret_cast<char*>(file_data.data()), nbytes),
+      "Could not load contents of file '%s'",
+      path.string().c_str());
+  return file_data;
+}
+#endif
+
 int main(int argc, char** argv) {
   executorch::runtime::runtime_init();
 
@@ -172,20 +209,86 @@ int main(int argc, char** argv) {
     opt_guard.emplace();
   }
 #endif // ET_USE_THREADPOOL
-  // Create a loader to get the data of the program file. There are other
-  // DataLoaders that use mmap() or point to data that's already in memory, and
-  // users can create their own DataLoaders to load from arbitrary sources.
-  const char* model_path = FLAGS_model_path.c_str();
-  Result<FileDataLoader> loader = FileDataLoader::from(model_path);
-  ET_CHECK_MSG(
-      loader.ok(),
-      "FileDataLoader::from() failed: 0x%" PRIx32,
-      (uint32_t)loader.error());
 
-  // Load .ptd file if provided
+  bool bundle_io = false;
+  // Bundled program payload; stays null/0 unless a bundled model is detected.
+  size_t program_data_len = 0;
+  const void* program_data = nullptr;
+#ifdef ET_BUNDLE_IO_ENABLED
+  // model_pte points into model_file_data, so the vector must outlive its use.
+  std::vector<uint8_t> model_file_data = try_load_file(FLAGS_model_path);
+  uint8_t* model_pte = model_file_data.data();
+  size_t pte_size = model_file_data.size();
+  // Bundled test set used for input loading and output verification.
+  constexpr size_t testset_idx = 0;
+  // Check for bundled IO provided model.
+  bundle_io = executorch::bundled_program::is_bundled_program(
+      reinterpret_cast<void*>(model_pte), pte_size);
+  if (bundle_io) {
+    // BundleIO bpte file is provided - dig out the actual model from the data
+    // area.
+    ET_LOG(Debug, "PTE Model with bundle io detected.");
+    Error status = executorch::bundled_program::get_program_data(
+        reinterpret_cast<void*>(model_pte),
+        pte_size,
+        &program_data,
+        &program_data_len);
+
+    ET_CHECK_MSG(
+        status == Error::Ok,
+        "get_program_data() from bundle PTE failed: 0x%" PRIx32,
+        static_cast<uint32_t>(status));
+  } else {
+    ET_LOG(Debug, "PTE Model has no bundled IO");
+  }
+#endif
+
+  // Inputs can come from bundleio, as optional input file(s), or
+  // everything hardcoded to ones.
+  std::vector<std::string> inputs_storage;
+  std::vector<std::pair<char*, size_t>> input_buffers;
+  if (!bundle_io) {
+    if (!FLAGS_inputs.empty()) {
+      ET_LOG(Info, "Loading inputs from input file(s).");
+      std::stringstream list_of_input_files(FLAGS_inputs);
+      std::string path;
+
+      std::vector<std::string> file_paths;
+      while (std::getline(list_of_input_files, path, ',')) {
+        file_paths.push_back(std::move(path));
+      }
+      // First reserve number of elements to avoid vector reallocations.
+      inputs_storage.reserve(file_paths.size());
+
+      for (const auto& file_path : file_paths) {
+        std::ifstream input_file_handle(
+            file_path, std::ios::binary | std::ios::ate);
+
+        if (!input_file_handle) {
+          ET_LOG(Error, "Failed to open input file: %s\n", file_path.c_str());
+          return 1;
+        }
+
+        std::streamsize file_size = input_file_handle.tellg();
+        input_file_handle.seekg(0, std::ios::beg);
+
+        // Reserve memory for actual file contents.
+        inputs_storage.emplace_back(file_size, '\0');
+
+        if (!input_file_handle.read(inputs_storage.back().data(), file_size)) {
+          ET_LOG(Error, "Failed to read input file: %s\n", file_path.c_str());
+          return 1;
+        }
+
+        input_buffers.emplace_back(&inputs_storage.back()[0], file_size);
+      }
+    }
+  }
+
   std::unique_ptr<FileDataLoader> ptd_loader;
   std::unique_ptr<FlatTensorDataMap> ptd_data_map;
   if (!FLAGS_data_path.empty()) {
+    ET_LOG(Info, "Loading tensor data from .ptd file.");
     const char* data_path = FLAGS_data_path.c_str();
     Result<FileDataLoader> ptd_loader_result = FileDataLoader::from(data_path);
     ET_CHECK_MSG(
@@ -210,51 +313,41 @@ int main(int argc, char** argv) {
         static_cast<uint64_t>(ptd_data_map->get_num_keys().get()));
   }
 
-  std::vector<std::string> inputs_storage;
-  std::vector<std::pair<char*, size_t>> input_buffers;
-
-  std::stringstream list_of_input_files(FLAGS_inputs);
-  std::string path;
-
-  // First reserve memory for number of vector elements to avoid vector
-  // reallocations when emplacing back.
-  std::vector<std::string> file_paths;
-  while (std::getline(list_of_input_files, path, ',')) {
-    file_paths.push_back(std::move(path));
-  }
-  inputs_storage.reserve(file_paths.size());
-
-  for (const auto& file_path : file_paths) {
-    std::ifstream input_file_handle(
-        file_path, std::ios::binary | std::ios::ate);
-
-    if (!input_file_handle) {
-      ET_LOG(Error, "Failed to open input file: %s\n", file_path.c_str());
-      return 1;
-    }
-
-    std::streamsize file_size = input_file_handle.tellg();
-    input_file_handle.seekg(0, std::ios::beg);
-
-    // Reserve memory for actual file contents.
-    inputs_storage.emplace_back(file_size, '\0');
-
-    if (!input_file_handle.read(&inputs_storage.back()[0], file_size)) {
-      ET_LOG(Error, "Failed to read input file: %s\n", file_path.c_str());
-      return 1;
-    }
+  // Create a loader to get the data of the program file. There are other
+  // DataLoaders that use mmap() or point to data that's already in memory, and
+  // users can create their own DataLoaders to load from arbitrary sources.
+  std::unique_ptr<DataLoader> loader;
 
-    input_buffers.emplace_back(&inputs_storage.back()[0], file_size);
+  if (bundle_io) {
+    Result<BufferDataLoader> buffer_loader =
+        BufferDataLoader(program_data, program_data_len);
+    ET_CHECK_MSG(
+        buffer_loader.ok(),
+        "BufferDataLoader failed: 0x%" PRIx32,
+        static_cast<uint32_t>(buffer_loader.error()));
+    ET_LOG(
+        Debug,
+        "Bundled IO PTE Model data loaded. Size: %zu bytes.",
+        program_data_len);
+    loader = std::make_unique<BufferDataLoader>(std::move(buffer_loader.get()));
+  } else {
+    Result<FileDataLoader> file_loader =
+        FileDataLoader::from(FLAGS_model_path.c_str());
+    ET_CHECK_MSG(
+        file_loader.ok(),
+        "FileDataLoader::from() failed: 0x%" PRIx32,
+        static_cast<uint32_t>(file_loader.error()));
+    loader = std::make_unique<FileDataLoader>(std::move(file_loader.get()));
   }
 
   // Parse the program file. This is immutable, and can also be reused between
   // multiple execution invocations across multiple threads.
-  Result<Program> program = Program::load(&loader.get());
+  Result<Program> program = Program::load(loader.get());
   if (!program.ok()) {
-    ET_LOG(Error, "Failed to parse model file %s", model_path);
+    ET_LOG(Error, "Failed to parse model file %s", FLAGS_model_path.c_str());
     return 1;
   }
-  ET_LOG(Info, "Model file %s is loaded.", model_path);
+  ET_LOG(Info, "Model file %s is loaded.", FLAGS_model_path.c_str());
 
   // Use the first method in the program.
   const char* method_name = nullptr;
@@ -347,9 +440,8 @@ int main(int argc, char** argv) {
   et_timestamp_t time_spent_executing = 0;
   // Run the model.
   for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
-    ET_LOG(Debug, "Preparing inputs.");
     // Allocate input tensors and set all of their elements to 1 or to the
-    // contents of input_buffers if available. The `inputs`
+    // contents of input_buffers if available. For non bundled IO, the `inputs`
     // variable owns the allocated memory and must live past the last call to
     // `execute()`.
     //
@@ -357,13 +449,30 @@ int main(int argc, char** argv) {
     // because inputs whose space gets reused by memory planning (if
     // any such inputs exist) will not be preserved for the next
     // execution.
-    auto inputs = executorch::extension::prepare_input_tensors(
-        *method, {}, input_buffers);
-    ET_CHECK_MSG(
-        inputs.ok(),
-        "Could not prepare inputs: 0x%" PRIx32,
-        (uint32_t)inputs.error());
-    ET_LOG(Debug, "Inputs prepared.");
+    std::optional<executorch::extension::BufferCleanup> inputs;
+
+#ifdef ET_BUNDLE_IO_ENABLED
+    if (bundle_io) {
+      ET_LOG(Debug, "Getting inputs from bundled IO");
+      Error status = executorch::bundled_program::load_bundled_input(
+          *method, model_pte, testset_idx);
+      ET_CHECK_MSG(
+          status == Error::Ok,
+          "load_bundled_input failed with status 0x%" PRIx32,
+          static_cast<uint32_t>(status));
+    } else
+#endif
+    {
+      ET_LOG(Debug, "Preparing inputs.");
+      auto res = executorch::extension::prepare_input_tensors(
+          *method, {}, input_buffers);
+      ET_CHECK_MSG(
+          res.ok(),
+          "Could not prepare inputs: 0x%" PRIx32,
+          (uint32_t)res.error());
+      inputs.emplace(std::move(res.get()));
+      ET_LOG(Debug, "Inputs prepared.");
+    }
 
     const et_timestamp_t before_execute =
         executorch::runtime::pal_current_ticks();
@@ -375,7 +484,7 @@ int main(int argc, char** argv) {
         status == Error::Ok,
         "Execution of method %s failed with status 0x%" PRIx32,
         method_name,
-        (uint32_t)status);
+        static_cast<uint32_t>(status));
   }
   const auto tick_ratio = et_pal_ticks_to_ns_multiplier();
   constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000;
@@ -460,5 +569,58 @@ int main(int argc, char** argv) {
     ET_CHECK_MSG(status == Error::Ok, "Failed to save ETDump file.");
   }
 
+#ifdef ET_BUNDLE_IO_ENABLED
+  if (bundle_io) {
+    // With bundled io we can check the result.
+    bool model_ok = false;
+    // Error stats are informational only; pass/fail is decided further down.
+    ErrorStats stats =
+        compute_method_output_error_stats(*method, model_pte, testset_idx);
+
+    if (stats.status == Error::Ok) {
+      ET_LOG(Info, "=== Error stats for testset %zu ===", testset_idx);
+      ET_LOG(Info, " mean_absolute_error: %f", stats.mean_abs_error);
+      ET_LOG(Info, " max_absolute_error:  %f", stats.max_abs_error);
+      ET_LOG(Info, " mean_relative_error: %f", stats.mean_relative_error);
+      ET_LOG(Info, " max_relative_error:  %f", stats.max_relative_error);
+    } else {
+      ET_LOG(
+          Info,
+          "=== Error calculating stats for testset %zu ERROR: 0x%" PRIx32
+          "===",
+          testset_idx,
+          static_cast<uint32_t>(stats.status));
+    }
+    // The verification status alone decides the exit code.
+    Error status = verify_method_outputs(
+        *method,
+        model_pte,
+        testset_idx,
+        FLAGS_bundleio_rtol,
+        FLAGS_bundleio_atol);
+    if (status == Error::Ok) {
+      ET_LOG(Info, "Model output match expected BundleIO bpte ref data.");
+      ET_LOG(Info, "TEST: BundleIO index[%zu] Test_result: PASS", testset_idx);
+      model_ok = true;
+    } else {
+      ET_LOG(
+          Error,
+          "Model output don't match expected BundleIO bpte ref data. rtol=%f atol=%f",
+          FLAGS_bundleio_rtol,
+          FLAGS_bundleio_atol);
+      ET_LOG(Error, "TEST: BundleIO index[%zu] Test_result: FAIL", testset_idx);
+      ET_LOG(
+          Error,
+          "Bundle verification failed with status 0x%" PRIx32,
+          static_cast<uint32_t>(status));
+      model_ok = false;
+    }
+
+    if (!model_ok) {
+      return 1;
+    }
+  }
+#endif
+
   return 0;
 }
diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl
index d1304a84bcb..61a4db43f68 100644
--- a/examples/portable/executor_runner/targets.bzl
+++ b/examples/portable/executor_runner/targets.bzl
@@ -18,6 +18,7 @@ def define_common_targets():
             "//executorch/runtime/executor:program",
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/extension/data_loader:file_data_loader",
+            "//executorch/extension/data_loader:buffer_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
             "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
@@ -38,6 +39,7 @@ def define_common_targets():
         deps = [
             "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:file_data_loader",
+            "//executorch/extension/data_loader:buffer_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
             "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 861e41e4a63..a020e2b107b 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -47,6 +48,9 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_ENABLE_EVENT_TRACER "Build with ET_EVENT_TRACER_ENABLED" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_ENABLE_BUNDLE_IO "Build with ET_BUNDLE_IO_ENABLED" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_OPTIMIZE_SIZE
   "Build executorch runtime optimizing for binary size" BOOL OFF
@@ -295,6 +299,10 @@ check_required_options_on(
   IF_ON EXECUTORCH_ENABLE_EVENT_TRACER REQUIRES EXECUTORCH_BUILD_DEVTOOLS
 )
 
+check_required_options_on(
+  IF_ON EXECUTORCH_ENABLE_BUNDLE_IO REQUIRES EXECUTORCH_BUILD_DEVTOOLS
+)
+
 check_required_options_on(
   IF_ON EXECUTORCH_BUILD_EXECUTOR_RUNNER REQUIRES
   EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL