From 013516065b650a01c15afb5c311f6ef34639a63a Mon Sep 17 00:00:00 2001
From: Zhengjun Xing
Date: Thu, 16 Oct 2025 10:10:20 +0000
Subject: [PATCH] Fix OSError: [Errno 24] Too many open files in multi-copy
 benchmark

When running benchmarks with a large number of copies, the process may
raise: OSError: [Errno 24] Too many open files.

Example command:

(fbgemm_gpu_env)$ ulimit -n 1048576
(fbgemm_gpu_env)$ python ./bench/tbe/tbe_inference_benchmark.py nbit-cpu \
    --num-embeddings=40000000 --bag-size=2 --embedding-dim=96 \
    --batch-size=162 --num-tables=8 --weights-precision=int4 \
    --output-dtype=fp32 --copies=96 --iters=30000

PyTorch multiprocessing provides two shared-memory strategies:

1. file_descriptor (default)
2. file_system

The default file_descriptor strategy uses file descriptors as shared
memory handles, which can result in a large number of open FDs when many
tensors are shared. If the total number of open FDs exceeds the system
limit and cannot be raised, the file_system strategy should be used
instead.

This patch allows switching to the file_system strategy by setting:

export PYTORCH_SHARE_STRATEGY='file_system'

Reference:
https://pytorch.org/docs/stable/multiprocessing.html#sharing-strategies
---
 fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
index 1243f14db4..84fa06bdd8 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/bench/bench_runs.py
@@ -153,6 +153,12 @@ def benchmark_cpu_requests_mp(
         float: The average runtime per iteration in seconds.
     """
+    import os
+    strategy = os.environ.get('PYTORCH_SHARE_STRATEGY')
+    current_strategy = torch.multiprocessing.get_sharing_strategy()
+    if strategy is not None and current_strategy != strategy:
+        torch.multiprocessing.set_sharing_strategy(strategy)
+
     cpu_bm_barrier.create_barrier(num_copies)
     worker_pool = torch.multiprocessing.Pool(num_copies)