From 013516065b650a01c15afb5c311f6ef34639a63a Mon Sep 17 00:00:00 2001
From: Zhengjun Xing
Date: Thu, 16 Oct 2025 10:10:20 +0000
Subject: [PATCH] Fix OSError: [Errno 24] Too many open files in multi-copy
 benchmark

When running benchmarks with a large number of copies, the process may
raise: OSError: [Errno 24] Too many open files.

Example command:

(fbgemm_gpu_env)$ ulimit -n 1048576
(fbgemm_gpu_env)$ python ./bench/tbe/tbe_inference_benchmark.py nbit-cpu \
    --num-embeddings=40000000 --bag-size=2 --embedding-dim=96 \
    --batch-size=162 --num-tables=8 --weights-precision=int4 \
    --output-dtype=fp32 --copies=96 --iters=30000

PyTorch multiprocessing provides two shared-memory strategies:

1. file_descriptor (default)
2. file_system

The default file_descriptor strategy uses file descriptors as shared
memory handles, which can result in a large number of open FDs when many
tensors are shared. If the total number of open FDs exceeds the system
limit and cannot be raised, the file_system strategy should be used
instead.

This patch allows switching to the file_system strategy by setting:

export PYTORCH_SHARE_STRATEGY='file_system'

Reference:
https://pytorch.org/docs/stable/multiprocessing.html#sharing-strategies
---
 fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
index 1243f14db4..84fa06bdd8 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
@@ -153,6 +153,12 @@ def benchmark_cpu_requests_mp(
         float: The average runtime per iteration in seconds.
     """
+    import os
+    strategy = os.environ.get('PYTORCH_SHARE_STRATEGY')
+    current_strategy = torch.multiprocessing.get_sharing_strategy()
+    if strategy is not None and current_strategy != strategy:
+        torch.multiprocessing.set_sharing_strategy(strategy)
+
     cpu_bm_barrier.create_barrier(num_copies)
     worker_pool = torch.multiprocessing.Pool(num_copies)