Skip to content

Commit 87e933c

Browse files
committed
detect topology directly for xdist
1 parent 426015f commit 87e933c

File tree

4 files changed

+75
-76
lines changed

4 files changed

+75
-76
lines changed

.github/actions/get-runner/action.yml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Get Runner Infos
22

33
inputs:
44
ut_name:
5-
required: true
5+
required: false
66
type: string
77
description: Which ut to launch
88

@@ -64,22 +64,7 @@ runs:
6464
}' |wc -l)"
6565
cpus_per_xpu="$(echo |awk -v c="${cpu_num}" -v x="${xpu_num}" '{printf c/x}')"
6666
if [ "${{ inputs.ut_name }}" == "xpu_distributed" ];then
67-
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
68-
if (x > 0) {
69-
split(z, xpu_list, ",");
70-
for (i=0;i<x;i=i+4) {
71-
if (z != "") {
72-
ze = xpu_list[i+1];
73-
} else {
74-
ze = i;
75-
}
76-
printf(" --tx popen//env:ZE_AFFINITY_MASK=%d,%d,%d,%d//env:OMP_NUM_THREADS=%d//python=\"numactl -l -C %d-%d python\"",
77-
ze,ze+1,ze+2,ze+3,4*cx,i*cx,(i+4)*cx-1);
78-
}
79-
}else {
80-
printf(" -n 1 ");
81-
}
82-
}')"
67+
pytest_extra_args="$(python ${{ github.workspace }}/.github/scripts/check-topology.py)"
8368
else
8469
pytest_extra_args="$(echo |awk -v x="${xpu_num}" -v z="${ZE_AFFINITY_MASK}" -v cx="${cpus_per_xpu}" '{
8570
if (x > 0) {

.github/scripts/check-topology.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
"""Print pytest-xdist ``--tx`` worker specs derived from ``xpu-smi topology -m``.

One ``--tx popen`` spec is emitted for every group of cards whose topology
rows list exactly each other as linked peers and that share one CPU affinity
range.  Each spec pins the worker to that group through ``ZE_AFFINITY_MASK``
and to the group's CPUs through ``numactl -C``.

On success the specs are printed on stdout and the exit status is 0.  On
failure a message is printed on stderr and the exit status is 255; stdout is
left empty because the caller captures it as pytest arguments.
"""
import subprocess
import sys

# The raw tool output is kept on disk, as earlier versions of this script did.
TOPOLOGY_LOG = "topology.log"

# Filled with (device list, thread count, cpu range).
TX_TEMPLATE = (
    ' --tx popen//env:ZE_AFFINITY_MASK=%s//env:OMP_NUM_THREADS=%d'
    '//python="numactl -l -C %s python"'
)


def is_linked(cell):
    """Return True when a topology cell counts as part of the link group.

    A cell qualifies when it contains ``XL`` or ``S`` but not ``SYS``.
    NOTE(review): presumably ``S`` marks the device itself and ``XL*`` an
    Xe Link connection -- confirm against the xpu-smi documentation.
    """
    return "SYS" not in cell and ("XL" in cell or "S" in cell)


def parse_topology(text):
    """Parse the topology matrix into per-card link groups and CPU ranges.

    Args:
        text: Combined output of ``xpu-smi topology -m``.

    Returns:
        A ``(link_groups, cpu_ranges)`` tuple of dicts keyed by card id (the
        part of the device id before ``/``).  ``link_groups`` maps a card to
        the comma-joined column indices of its linked cells; ``cpu_ranges``
        maps a card to the first comma-separated entry of the row's last
        column.
    """
    link_groups = {}
    cpu_ranges = {}
    for raw in text.splitlines():
        # The header row also starts with "GPU " once stripped; skip it.
        if "CPU Affinity" in raw:
            continue
        row = raw.strip()
        if not row.startswith("GPU "):
            continue
        cells = row.split()
        card = cells[1].split("/")[0]
        # The first two cells are "GPU" and the device id, so matrix column
        # k sits at cell index k + 2.
        peers = [
            str(index - 2) for index, cell in enumerate(cells) if is_linked(cell)
        ]
        link_groups[card] = ",".join(peers)
        cpu_ranges[card] = cells[-1].split(",")[0]
    return link_groups, cpu_ranges


def select_groups(link_groups, cpu_ranges):
    """Keep the link groups that are self-consistent and share one CPU range.

    A group is kept only when the cards reporting that peer list are exactly
    the cards named in the list, and all of them have the same CPU range.

    Returns:
        A dict mapping the comma-joined group to its CPU range.
    """
    members = {}
    for card, peers in link_groups.items():
        members.setdefault(peers, []).append(card)

    selected = {}
    for peers, cards in members.items():
        if peers != ",".join(cards):
            continue
        ranges = {cpu_ranges[card] for card in cards}
        if len(ranges) == 1:
            selected[peers] = ranges.pop()
    return selected


def cpu_count(cpu_range):
    """Return the number of CPUs in ``"first-last"`` or a single ``"cpu"`` id."""
    first, _, last = cpu_range.partition("-")
    # A bare id has no "-" part; treat it as a one-CPU range.
    return int(last or first) - int(first) + 1


def build_args(selected):
    """Render one ``--tx`` spec per selected group, concatenated in order."""
    return "".join(
        TX_TEMPLATE % (group, cpu_count(cpu_range), cpu_range)
        for group, cpu_range in selected.items()
    )


def read_topology():
    """Run ``xpu-smi topology -m`` and return its output, or None on failure.

    The command is run directly rather than through a shell pipeline so the
    exit status checked is the tool's own.  The output is also written to
    ``topology.log`` in the current directory.
    """
    try:
        proc = subprocess.run(
            ["xpu-smi", "topology", "-m"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except OSError:
        # Covers a missing or non-executable xpu-smi binary.
        return None
    with open(TOPOLOGY_LOG, "w") as log:
        log.write(proc.stdout)
    if proc.returncode != 0:
        return None
    return proc.stdout


def main():
    """Print the ``--tx`` specs and return the process exit status."""
    text = read_topology()
    if text is None:
        print("xpu-smi topology failed", file=sys.stderr)
        return 255
    selected = select_groups(*parse_topology(text))
    if not selected:
        print("No Xelink detected", file=sys.stderr)
        return 255
    print(build_args(selected))
    return 0


if __name__ == "__main__":
    sys.exit(main())

.github/workflows/_linux_ut.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ env:
3939
jobs:
4040
runner:
4141
runs-on: ${{ inputs.runner }}
42-
name: get-runner
42+
name: get-runner
4343
outputs:
4444
runner_id: ${{ steps.runner-info.outputs.runner_id }}
4545
user_id: ${{ steps.runner-info.outputs.user_id }}

test/xpu/skip_list_dist.py

Lines changed: 8 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,33 @@
11
skip_dict = {
22
"../../../../test/distributed/fsdp/test_fsdp_checkpoint.py": None,
3-
"../../../../test/distributed/fsdp/test_checkpoint_wrapper.py": None,
4-
"../../../../test/distributed/fsdp/test_fsdp_backward_prefetch.py": None,
5-
"../../../../test/distributed/fsdp/test_fsdp_apply.py": None,
6-
"../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": None,
7-
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
8-
"../../../../test/distributed/fsdp/test_fsdp_comm_hooks.py": None,
9-
"../../../../test/distributed/fsdp/test_fsdp_core.py": (
10-
"test_transformer_no_grad_mixed_precision_True_xpu",
3+
"../../../../test/distributed/fsdp/test_fsdp_clip_grad_norm.py": (
4+
"test_ddp_parity_xpu",
115
),
12-
"../../../../test/distributed/fsdp/test_fsdp_exec_order.py": None,
13-
"../../../../test/distributed/fsdp/test_fsdp_dtensor_state_dict.py": None,
6+
"../../../../test/distributed/fsdp/test_fsdp_comm.py": None,
147
"../../../../test/distributed/fsdp/test_fsdp_flatten_params.py": None,
15-
"../../../../test/distributed/fsdp/test_fsdp_freezing_weights.py": None,
16-
"../../../../test/distributed/fsdp/test_fsdp_hybrid_shard.py": None,
17-
"../../../../test/distributed/fsdp/test_fsdp_ignored_modules.py": None,
18-
"../../../../test/distributed/fsdp/test_fsdp_memory.py": None,
19-
"../../../../test/distributed/fsdp/test_fsdp_meta.py": None,
20-
"../../../../test/distributed/fsdp/test_fsdp_misc.py": None,
21-
"../../../../test/distributed/fsdp/test_fsdp_overlap.py": None,
22-
"../../../../test/distributed/fsdp/test_fsdp_pure_fp16.py": None,
238
"../../../../test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py": None,
24-
"../../../../test/distributed/fsdp/test_fsdp_state_dict.py": None,
25-
"../../../../test/distributed/fsdp/test_fsdp_tp_integration.py": None,
26-
"../../../../test/distributed/fsdp/test_fsdp_traversal.py": None,
27-
"../../../../test/distributed/fsdp/test_fsdp_uneven.py": None,
289
"../../../../test/distributed/fsdp/test_fsdp_unshard_params.py": None,
29-
"../../../../test/distributed/fsdp/test_fsdp_use_orig_params.py": (
30-
"test_diff_hyperparams_sharding_strategy_str_full_shard",
31-
"test_diff_hyperparams_sharding_strategy_str_shard_grad_op",
32-
),
33-
"../../../../test/distributed/fsdp/test_hsdp_dtensor_state_dict.py": None,
34-
"../../../../test/distributed/fsdp/test_shard_utils.py": None,
3510
"../../../../test/distributed/fsdp/test_utils.py": None,
3611
"../../../../test/distributed/fsdp/test_wrap.py": None,
37-
"../../../../test/distributed/fsdp/test_fsdp_fine_tune.py": None,
3812
"../../../../test/distributed/fsdp/test_fsdp_fx.py": None,
3913
"../../../../test/distributed/fsdp/test_fsdp_input.py": None,
4014
"../../../../test/distributed/fsdp/test_fsdp_multiple_forward.py": None,
41-
"../../../../test/distributed/fsdp/test_fsdp_multiple_wrapping.py": None,
42-
"../../../../test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py": None,
4315
"../../../../test/distributed/_composable/fsdp/test_fully_shard_comm.py": None,
44-
"../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": (
45-
"test_compiled_autograd_ctx",
46-
"test_nested_fully_shard_backend_aot_eager",
47-
"test_nested_fully_shard_backend_aot_eager_decomp_partition",
48-
"test_nested_fully_shard_backend_inductor_fullgraph_True",
49-
"test_nested_fully_shard_backend_inductor_fullgraph_True_graph_partition",
50-
"test_simple_mlp_fullgraph_backend_aot_eager",
51-
"test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition",
52-
"test_simple_mlp_fullgraph_backend_inductor",
53-
"test_transformer_backend_aot_eager",
54-
"test_transformer_backend_aot_eager_decomp_partition",
55-
),
56-
"../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None,
57-
"../../../../test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py": None,
16+
"../../../../test/distributed/_composable/fsdp/test_fully_shard_compile.py": None,
5817
"../../../../test/distributed/_composable/fsdp/test_fully_shard_state_dict.py": (
5918
"test_cached_state_dict",
6019
"test_dp_state_dict_cpu_offload",
6120
),
62-
"../../../../test/distributed/_composable/fsdp/test_fully_shard_training.py": (
63-
"test_explicit_prefetching",
64-
"test_multi_forward_module",
65-
"test_train_parity_single_group_shard_dim0",
66-
"test_train_parity_single_group_shard_largest_dim",
67-
"test_train_parity_shard_placement_fn_shard_largest_dim",
68-
"test_3d_mlp_with_nd_mesh",
69-
),
70-
"../../../../test/distributed/_composable/test_composability/test_2d_composability.py": (
71-
"test_tp_with_fsdp_offloading",
72-
),
73-
"../../../../test/distributed/_composable/test_replicate_with_compiler.py": None,
74-
"../../../../test/distributed/_composable/test_composability/test_pp_composability.py": None,
21+
"../../../../test/distributed/_composable/fsdp/test_fully_shard_frozen.py": None,
7522
"../../../../test/distributed/_composable/test_checkpoint.py": None,
7623
"../../../../test/distributed/_composable/test_contract.py": None,
7724
"distributed/test_c10d_xccl.py": None,
7825
"distributed/test_c10d_ops_xccl.py": None,
7926
"../../../../test/distributed/test_functional_api.py": None,
27+
"../../../../test/distributed/test_c10d_common.py": None,
8028
"../../../../test/distributed/_tools/test_fsdp2_mem_tracker.py": None,
8129
"../../../../test/distributed/_tools/test_mem_tracker.py": None,
8230
"../../../../test/distributed/_tools/test_memory_tracker.py": None,
31+
"../../../../test/distributed/tensor/test_random_ops.py": None,
32+
"../../../../test/distributed/tensor/test_math_ops.py": None,
8333
}

0 commit comments

Comments
 (0)