@@ -521,6 +521,7 @@ def process_monitor(self, sproc: "subprocess.Popen[str]", kill_switch: threading
521
521
memory_usage : MutableSequence [Optional [int ]] = [None ]
522
522
523
523
mem_tm : "Optional[Timer]" = None
524
+ ks_tm : "Optional[Timer]" = None
524
525
525
526
def get_tree_mem_usage (memory_usage : MutableSequence [Optional [int ]]) -> None :
526
527
nonlocal mem_tm
@@ -542,10 +543,27 @@ def get_tree_mem_usage(memory_usage: MutableSequence[Optional[int]]) -> None:
542
543
if mem_tm is not None :
543
544
mem_tm .cancel ()
544
545
546
+ def monitor_kill_switch () -> None :
547
+ nonlocal ks_tm
548
+ if kill_switch .is_set ():
549
+ _logger .error ("[job %s] terminating by kill switch" , self .name )
550
+ if sproc .stdin : sproc .stdin .close ()
551
+ sproc .terminate ()
552
+ else :
553
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
554
+ ks_tm .daemon = True
555
+ ks_tm .start ()
556
+
557
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
558
+ ks_tm .daemon = True
559
+ ks_tm .start ()
560
+
545
561
mem_tm = Timer (interval = 1 , function = get_tree_mem_usage , args = (memory_usage ,))
546
562
mem_tm .daemon = True
547
563
mem_tm .start ()
564
+
548
565
sproc .wait ()
566
+ ks_tm .cancel ()
549
567
mem_tm .cancel ()
550
568
if memory_usage [0 ] is not None :
551
569
_logger .info (
@@ -859,20 +877,48 @@ def docker_monitor(
859
877
process : "subprocess.Popen[str]" ,
860
878
kill_switch : threading .Event ,
861
879
) -> None :
862
- """Record memory usage of the running Docker container."""
880
+ """Record memory usage of the running Docker container. Terminate if kill_switch is activated."""
881
+
882
+ ks_tm : "Optional[Timer]" = None
883
+ cid : Optional [str ] = None
884
+
885
+ def monitor_kill_switch () -> None :
886
+ nonlocal ks_tm
887
+ if kill_switch .is_set ():
888
+ _logger .error ("[job %s] terminating by kill switch" , self .name )
889
+ if process .stdin :
890
+ process .stdin .close ()
891
+ if cid is not None :
892
+ kill_proc = subprocess .Popen ( # nosec
893
+ [docker_exe , "kill" , cid ], shell = False # nosec
894
+ )
895
+ try :
896
+ kill_proc .wait (timeout = 10 )
897
+ except subprocess .TimeoutExpired :
898
+ kill_proc .kill ()
899
+ process .terminate () # Always terminate, even if we tried with the cidfile
900
+ else :
901
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
902
+ ks_tm .daemon = True
903
+ ks_tm .start ()
904
+
905
+ ks_tm = Timer (interval = 1 , function = monitor_kill_switch )
906
+ ks_tm .daemon = True
907
+ ks_tm .start ()
908
+
863
909
# Todo: consider switching to `docker create` / `docker start`
864
910
# instead of `docker run` as `docker create` outputs the container ID
865
911
# to stdout, but the container is frozen, thus allowing us to start the
866
912
# monitoring process without dealing with the cidfile or too-fast
867
913
# container execution
868
- cid : Optional [str ] = None
869
914
while cid is None :
870
915
time .sleep (1 )
871
916
# This is needed to avoid a race condition where the job
872
917
# was so fast that it already finished when it arrives here
873
918
if process .returncode is None :
874
919
process .poll ()
875
920
if process .returncode is not None :
921
+ ks_tm .cancel ()
876
922
if cleanup_cidfile :
877
923
try :
878
924
os .remove (cidfile )
@@ -904,6 +950,9 @@ def docker_monitor(
904
950
except OSError as exc :
905
951
_logger .warning ("Ignored error with %s stats: %s" , docker_exe , exc )
906
952
return
953
+ finally :
954
+ ks_tm .cancel ()
955
+
907
956
max_mem_percent : float = 0.0
908
957
mem_percent : float = 0.0
909
958
with open (stats_file_name ) as stats :
@@ -938,7 +987,7 @@ def _job_popen(
938
987
job_script_contents : Optional [str ] = None ,
939
988
timelimit : Optional [int ] = None ,
940
989
name : Optional [str ] = None ,
941
- monitor_function : Optional [Callable [["subprocess.Popen[str]" ], None ]] = None ,
990
+ monitor_function : Optional [Callable [["subprocess.Popen[str]" , "threading.Event" ], None ]] = None ,
942
991
default_stdout : Optional [Union [IO [bytes ], TextIO ]] = None ,
943
992
default_stderr : Optional [Union [IO [bytes ], TextIO ]] = None ,
944
993
) -> int :
@@ -993,7 +1042,7 @@ def terminate(): # type: () -> None
993
1042
tm .daemon = True
994
1043
tm .start ()
995
1044
if monitor_function :
996
- monitor_function (sproc )
1045
+ monitor_function (sproc , kill_switch )
997
1046
rcode = sproc .wait ()
998
1047
999
1048
if tm is not None :
@@ -1069,7 +1118,7 @@ def terminate(): # type: () -> None
1069
1118
tm .daemon = True
1070
1119
tm .start ()
1071
1120
if monitor_function :
1072
- monitor_function (sproc )
1121
+ monitor_function (sproc , kill_switch )
1073
1122
1074
1123
rcode = sproc .wait ()
1075
1124
0 commit comments