@@ -40,6 +40,40 @@ def setup_logger(name, log_file, level=logging.INFO):
4040metriclogger = setup_logger ('pipeline_metric_logger' , 'pipeline_metric.log' )
4141
4242
# A fallback solution to getting all child procs
# in case psutil has problems (PermissionError).
# It returns the same list as psutil.children(recursive=True).
def getChildProcs(basepid):
    """Return every descendant of ``basepid`` as a list of psutil.Process.

    The process tree is discovered by asking ``pgrep -P <pid>`` for the direct
    children of ``basepid`` and then, in turn, of every process found this way.

    Like ``psutil.Process(basepid).children(recursive=True)``, the result does
    NOT contain ``basepid`` itself (callers terminate or account for the
    returned processes, so including the base process would make a signal
    handler kill itself or make a monitor count the base process twice).

    :param basepid: PID of the process whose descendants are wanted.
    :return: list of psutil.Process objects; PIDs that have vanished by the
             time they are wrapped are silently skipped.
    """
    rootpid = int(basepid)
    seen = {rootpid}      # guards against visiting a PID twice
    stack = [rootpid]     # PIDs whose direct children still need to be queried
    childpids = []

    while stack:
        pid = stack.pop()
        if pid != rootpid:
            childpids.append(pid)
        try:
            output = subprocess.check_output(['pgrep', '-P', str(pid)])
        except (subprocess.CalledProcessError, OSError):
            # pgrep exits non-zero when the process has no children; a missing
            # or failing pgrep is treated the same way (no children found)
            continue
        fresh = [c for c in (int(tok) for tok in output.split()) if c not in seen]
        seen.update(fresh)
        # reversed so that children are visited in the order pgrep listed them
        stack.extend(reversed(fresh))

    plist = []
    for pid in childpids:
        try:
            proc = psutil.Process(pid)
        except psutil.NoSuchProcess:
            # process finished between discovery and now
            continue
        plist.append(proc)
    return plist
76+
4377#
4478# Code section to find all topological orderings
4579# of a DAG. This is used to know when we can schedule
@@ -363,20 +397,28 @@ def __init__(self, workflowfile, args, jmax=100):
def SIGHandler(self, signum, frame):
    """Signal handler: force shutdown of all child processes, then exit.

    All descendants of the current process are first asked to terminate;
    those still alive after a 3 second grace period are killed. Finally
    the current process exits with status 1.

    :param signum: number of the caught signal (only used for logging).
    :param frame: current stack frame as passed by the signal module (unused).
    """
    # basically forcing shut down of all child processes
    actionlogger.info("Signal " + str(signum) + " caught")
    mypid = os.getpid()
    try:
        procs = psutil.Process().children(recursive=True)
    except psutil.NoSuchProcess:
        # nothing we can enumerate; make sure 'procs' is still defined below
        procs = []
    except (psutil.AccessDenied, PermissionError):
        # fallback when psutil may not inspect the process tree; never keep
        # our own process in the list, otherwise we would terminate ourselves
        # before the children have been dealt with
        procs = [p for p in getChildProcs(mypid) if p.pid != mypid]

    for p in procs:
        actionlogger.info("Terminating " + str(p))
        try:
            p.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # give the processes a short grace period before killing the survivors
    _, alive = psutil.wait_procs(procs, timeout=3)
    for p in alive:
        try:
            actionlogger.info("Killing " + str(p))
            p.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    exit(1)
381423
382424 def getallrequirements (self , t ):
@@ -434,6 +476,7 @@ def submit(self, tid, nice=0):
434476 p .nice (nice )
435477 self .nicevalues [tid ]= nice
436478 except (psutil .NoSuchProcess , psutil .AccessDenied ):
479+ actionlogger .error ('Couldn\' t set nice value of ' + str (p .pid ) + ' to ' + str (nice ) + ' -- current value is ' + str (p .nice ()))
437480 self .nicevalues [tid ]= 0
438481 return p
439482
@@ -539,9 +582,12 @@ def monitor(self, process_list):
539582 psutilProcs = [ proc ]
540583 # use psutil for CPU measurement
541584 psutilProcs = psutilProcs + proc .children (recursive = True )
542- except (psutil .NoSuchProcess , psutil . AccessDenied ):
585+ except (psutil .NoSuchProcess ):
543586 continue
544587
588+ except (psutil .AccessDenied , PermissionError ):
589+ psutilProcs = psutilProcs + getChildProcs (pid )
590+
545591 # accumulate total metrics (CPU, memory)
546592 totalCPU = 0.
547593 totalPSS = 0.
0 commit comments