______________________ test_torch_engine_sub_proc_cleanup ______________________
Traceback (most recent call last):
File "/home/runner/.local/lib/python3.10/site-packages/_pytest/runner.py", line 353, in CallInfo.from_call
line: result: TResult | None = func()
locals:
result = <local> None
TResult = <global> +TResult
func = <local> <function _pytest.runner.call_and_report.<locals>.<lambda>>
File "/home/runner/.local/lib/python3.10/site-packages/_pytest/runner.py", line 245, in call_and_report.<locals>.<lambda>
line: lambda: runtest_hook(item=item, **kwds),
locals:
runtest_hook = <local> <HookCaller 'pytest_runtest_call'>
item = <local> <Function test_torch_engine_sub_proc_cleanup>
kwds = <local> {}
File "/home/runner/.local/lib/python3.10/site-packages/pluggy/_hooks.py", line 512, in HookCaller.__call__
line: return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
locals:
self = <local> <HookCaller 'pytest_runtest_call'>
self._hookexec = <local> <bound method PluginManager._hookexec of <_pytest.config.PytestPluginManager object at 0x7f36d5da0a00>>
self.name = <local> 'pytest_runtest_call', len = 19
self._hookimpls = <local> [<HookImpl plugin_name='threadexception', plugin=<module '_pytest.threadexception' from '/home/runner/.local/lib/python3.10/site-packages/_pytest/threadexception.py'>>, <HookImpl plugin_name='unraisableexception', plugin=<module '_pytest.unraisableexception' from '/home/runner/.local/lib/python3...., len = 6
kwargs = <local> {'item': <Function test_torch_engine_sub_proc_cleanup>}
firstresult = <local> False
File "/home/runner/.local/lib/python3.10/site-packages/pluggy/_manager.py", line 120, in PluginManager._hookexec
line: return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
locals:
self = <local> <_pytest.config.PytestPluginManager object at 0x7f36d5da0a00>
firstresult = <local> True
File "/home/runner/.local/lib/python3.10/site-packages/pluggy/_callers.py", line 167, in _multicall
line: raise exception
locals:
exception = <local> psutil.ZombieProcess(pid=15287, name='python', msg="PID still exists but it's a zombie")
File "/home/runner/.local/lib/python3.10/site-packages/pluggy/_callers.py", line 121, in _multicall
line: res = hook_impl.function(*args)
File "/home/runner/.local/lib/python3.10/site-packages/_pytest/python.py", line 166, in pytest_pyfunc_call
line: result = testfunction(**testargs)
locals:
testfunction = <local> <function test_torch_engine.test_torch_engine_sub_proc_cleanup>
testargs = <local> {}
File "/home/runner/work/returnn/returnn/tests/test_torch_engine.py", line 999, in test_torch_engine_sub_proc_cleanup
line: print(f"Child proc still running: {child_proc} {child_proc.cmdline()}")
locals:
child_proc = <local> psutil.Process(pid=15287, name='python', status='terminated', started='23:14:59')
File "/home/runner/.local/lib/python3.10/site-packages/psutil/__init__.py", line 748, in Process.cmdline
line: return self._proc.cmdline()
locals:
self = <local> psutil.Process(pid=15287, name='python', status='terminated', started='23:14:59')
self._proc = <local> <psutil._pslinux.Process object at 0x7f36d040b510>
self._proc.cmdline = <local> <bound method Process.cmdline of <psutil._pslinux.Process object at 0x7f36d040b510>>
File "/home/runner/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1593, in Process.nice_set
line: return fun(self, *args, **kwargs)
locals:
fun = <local> <function psutil._pslinux.Process.cmdline>
self = <local> <psutil._pslinux.Process object at 0x7f36d040b510>
args = <local> ()
kwargs = <local> {}
File "/home/runner/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1754, in Process.cmdline
line: self._raise_if_zombie()
locals:
self = <local> <psutil._pslinux.Process object at 0x7f36d040b510>
File "/home/runner/.local/lib/python3.10/site-packages/psutil/_pslinux.py", line 1648, in Process._raise_if_zombie
line: raise ZombieProcess(self.pid, self._name, self._ppid)
locals:
self = <local> <psutil._pslinux.Process object at 0x7f36d040b510>
self.pid = <local> 15287
self._name = <local> 'python', len = 6
self._ppid = <local> None
psutil.ZombieProcess: PID still exists but it's a zombie (pid=15287, name='python')
----------------------------- Captured stdout call -----------------------------
main (parent) pid is 15286
Using device: cpu (config)
Learning-rate-control: no file specified, not saving history (no proper restart possible)
Model: TrainTestModel(
(lin): Linear(in_features=9, out_features=2, bias=True)
)
net params #: 20
Optimizer: Adam (
Parameter Group 0
amsgrad: False
betas: (0.9, 0.999)
capturable: False
differentiable: False
eps: 1e-08
foreach: None
fused: False
lr: 1.0
maximize: False
weight_decay: 0
)
train proc: psutil.Process(pid=15286, name='python', status='running')
child proc: psutil.Process(pid=15287, name='python', status='sleeping', started='23:14:59') ['python', '-m', 'pytest', 'tests/test_torch_engine.py']
child proc: psutil.Process(pid=15294, name='TDL worker 0', status='sleeping', started='23:14:59') ['/opt/hostedtoolcache/Python/3.10.19/x64/bin/python', '-c', 'from multiprocessing.spawn import spawn_main; spawn_main(tracker_fd=17, pipe_handle=26)', '--multiprocessing-fork']
multi_proc_manager_with_watchdog process (15287: parent process 15286 is dead, new parent pid is 1, killing myself)
--------------------------- Captured stderr teardown ---------------------------
EXCEPTION
Traceback (most recent call last):
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/resource_sharer.py", line 138, in _ResourceSharer._serve
line: with self._listener.accept() as conn:
locals:
self = <local> <multiprocessing.resource_sharer._ResourceSharer object at 0x7f4120769e40>
self._listener = <local> <multiprocessing.connection.Listener object at 0x7f411f6813c0>
conn = <local> <multiprocessing.connection.Connection object at 0x7f411f681960>
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/connection.py", line 465, in Listener.accept
line: deliver_challenge(c, self._authkey)
locals:
c = <local> <multiprocessing.connection.Connection object at 0x7f411f681c30>
self = <local> <multiprocessing.connection.Listener object at 0x7f411f6813c0>
self._authkey = <local> b'+\x01\r\xfb\x1d\xdd\x05i\x9f\xb0\x1b\xb8*\\I\xd5\xa9\xc6\x8f\xf3z\xef\xf2\x010\x8d\x9d\x975\x97\x96\xf4', len = 32
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/connection.py", line 740, in deliver_challenge
line: response = connection.recv_bytes(256) # reject large message
locals:
connection = <local> <multiprocessing.connection.Connection object at 0x7f411f681c30>
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/connection.py", line 216, in _ConnectionBase.recv_bytes
line: buf = self._recv_bytes(maxlength)
locals:
self = <local> <multiprocessing.connection.Connection object at 0x7f411f681c30>
maxlength = <local> 256
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/connection.py", line 414, in Connection._recv_bytes
line: buf = self._recv(4)
locals:
self = <local> <multiprocessing.connection.Connection object at 0x7f411f681c30>
File "/opt/hostedtoolcache/Python/3.10.19/x64/lib/python3.10/multiprocessing/connection.py", line 379, in Connection._recv
line: chunk = read(handle, remaining)
locals:
read = <local> <built-in function read>
handle = <local> 7
remaining = <local> 4
ConnectionResetError: [Errno 104] Connection reset by peer
Let's see if we get this again.
https://github.com/rwth-i6/returnn/actions/runs/22647017683/job/65637402337#step:6:211
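This might be a race condition: according to the log, the child process (pid 15287) had already killed itself and was a zombie by the time the test called `child_proc.cmdline()`, which then raised `psutil.ZombieProcess`.
Or is it just a CI hiccup?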
Let's see if we get this again.