Skip to content

Commit babf8f7

Browse files
authored
improvement and bug fix for condor (#184)
* use the port number returned by `listenany`
* clean up the kill command
1 parent 426ab85 commit babf8f7

File tree

2 files changed

+12
-6
lines changed

2 files changed

+12
-6
lines changed

src/ClusterManagers.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ using Pkg
77
export launch, manage, kill, init_worker, connect
88
import Distributed: launch, manage, kill, init_worker, connect
99

10-
worker_arg() = `--worker=$(Distributed.init_multi(); cluster_cookie())`
10+
worker_cookie() = begin Distributed.init_multi(); cluster_cookie() end
11+
worker_arg() = `--worker=$(worker_cookie())`
1112

1213

1314
# PBS doesn't have the same semantics as SGE with respect to file accumulation,

src/condor.jl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ function condor_script(portnum::Integer, np::Integer, params::Dict)
2626
println(scriptf, line)
2727
end
2828
println(scriptf, "cd $(Base.shell_escape(dir))")
29-
println(scriptf, "$(Base.shell_escape(exename)) $exeflags $(Base.shell_escape(worker_arg())) | $telnetexe $(Base.shell_escape(hostname)) $portnum")
29+
println(scriptf, "$(Base.shell_escape(exename)) $(Base.shell_escape(exeflags)) -e 'using Distributed; start_worker($(repr(worker_cookie())))' | $telnetexe $(Base.shell_escape(hostname)) $portnum")
3030
close(scriptf)
3131

3232
input_files = ["$tdir/$jobname.sh"]
33-
push!(input_files, extrainputs...)
33+
append!(input_files, extrainputs)
3434
subf = open("$tdir/$jobname.sub", "w")
3535
println(subf, "executable = /bin/bash")
3636
println(subf, "arguments = ./$jobname.sh")
@@ -54,7 +54,7 @@ end
5454
function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Condition)
5555
try
5656
portnum = rand(8000:9000)
57-
_, server = listenany(Sockets.getaddrinfo("0.0.0.0"), portnum)
57+
portnum, server = listenany(ip"0.0.0.0", portnum)
5858
np = manager.np
5959

6060
script = condor_script(portnum, np, params)
@@ -83,10 +83,15 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
8383
end
8484
end
8585

86+
function kill(manager::HTCManager, id::Int64, config::WorkerConfig)
87+
remotecall(exit,id)
88+
close(config.io)
89+
end
90+
8691
function manage(manager::HTCManager, id::Integer, config::WorkerConfig, op::Symbol)
8792
if op == :finalize
88-
if !isnull(config.io)
89-
close(get(config.io))
93+
if !isnothing(config.io)
94+
close(config.io)
9095
end
9196
# elseif op == :interrupt
9297
# job = config[:job]

0 commit comments

Comments
 (0)