@@ -17,24 +17,28 @@ struct ElasticManager <: Distributed.ClusterManager
17
17
terminated:: Set{Int} # terminated worker ids
18
18
topology:: Symbol
19
19
sockname
20
+ manage_callback
20
21
printing_kwargs
21
22
22
- function ElasticManager (;addr= Sockets. IPv4 (" 127.0.0.1" ), port= 9009 , cookie= nothing , topology= :all_to_all , printing_kwargs= ())
23
+ function ElasticManager (;
24
+ addr= IPv4 (" 127.0.0.1" ), port= 9009 , cookie= nothing ,
25
+ topology= :all_to_all , manage_callback= elastic_no_op_callback, printing_kwargs= ()
26
+ )
23
27
Distributed. init_multi ()
24
28
cookie != = nothing && Distributed. cluster_cookie (cookie)
25
29
26
30
# Automatically check for the IP address of the local machine
27
31
if addr == :auto
28
32
try
29
- addr = Sockets. getipaddr (Distributed . IPv4)
33
+ addr = Sockets. getipaddr (Sockets . IPv4)
30
34
catch
31
35
error (" Failed to automatically get host's IP address. Please specify `addr=` explicitly." )
32
36
end
33
37
end
34
38
35
39
l_sock = Distributed. listen (addr, port)
36
40
37
- lman = new (Dict {Int, Distributed.WorkerConfig} (), Channel {Sockets.TCPSocket} (typemax (Int)), Set {Int} (), topology, Sockets. getsockname (l_sock), printing_kwargs)
41
+ lman = new (Dict {Int, Distributed.WorkerConfig} (), Channel {Sockets.TCPSocket} (typemax (Int)), Set {Int} (), topology, Sockets. getsockname (l_sock), manage_callback, printing_kwargs)
38
42
39
43
t1 = @async begin
40
44
while true
@@ -57,8 +61,10 @@ ElasticManager(port) = ElasticManager(;port=port)
57
61
# Positional convenience method: forwards `addr` and `port` to the keyword
# constructor, leaving every other keyword at its default.
function ElasticManager(addr, port)
    return ElasticManager(; addr=addr, port=port)
end
58
62
# Positional convenience method: forwards `addr`, `port` and `cookie` to the
# keyword constructor, leaving every other keyword at its default.
function ElasticManager(addr, port, cookie)
    return ElasticManager(; addr=addr, port=port, cookie=cookie)
end
59
63
64
# Default value of the `manage_callback` keyword of `ElasticManager`.
# Accepts the manager, a worker id and the operation symbol, ignores all
# three, and returns `nothing`.
function elastic_no_op_callback(::ElasticManager, ::Integer, ::Symbol)
    return nothing
end
60
65
61
66
function process_worker_conn (mgr:: ElasticManager , s:: Sockets.TCPSocket )
67
+ @debug " ElasticManager got new worker connection"
62
68
# Socket is the worker's STDOUT
63
69
wc = Distributed. WorkerConfig ()
64
70
wc. io = s
94
100
function Distributed. launch (mgr:: ElasticManager , params:: Dict , launched:: Array , c:: Condition )
95
101
# The workers have already been started.
96
102
while isready (mgr. pending)
103
+ @debug " ElasticManager.launch new worker"
97
104
wc= Distributed. WorkerConfig ()
98
105
wc. io = take! (mgr. pending)
99
106
push! (launched, wc)
104
111
105
112
function Distributed. manage (mgr:: ElasticManager , id:: Integer , config:: Distributed.WorkerConfig , op:: Symbol )
106
113
if op == :register
114
+ @debug " ElasticManager registering process id $id "
107
115
mgr. active[id] = config
116
+ mgr. manage_callback (mgr, id, op)
108
117
elseif op == :deregister
118
+ @debug " ElasticManager deregistering process id $id "
119
+ mgr. manage_callback (mgr, id, op)
109
120
delete! (mgr. active, id)
110
121
push! (mgr. terminated, id)
111
122
end
@@ -138,9 +149,18 @@ function Base.show(io::IO, mgr::ElasticManager)
138
149
end
139
150
140
151
# Does not return. If executing from a REPL try
141
- # @async connect_to_cluster (.....)
152
+ # @async elastic_worker (.....)
142
153
# addr, port that an ElasticManager on the master process is listening on.
143
- function elastic_worker (cookie, addr= " 127.0.0.1" , port= 9009 ; stdout_to_master= true )
154
+ function elastic_worker (
155
+ cookie:: AbstractString , addr:: AbstractString = " 127.0.0.1" , port:: Integer = 9009 ;
156
+ stdout_to_master:: Bool = true ,
157
+ Base. @nospecialize (env:: AbstractVector = [],)
158
+ )
159
+ @debug " ElasticManager.elastic_worker(cookie, $addr , $port ; stdout_to_master=$stdout_to_master , env=$env )"
160
+ for (k, v) in env
161
+ ENV [k] = v
162
+ end
163
+
144
164
c = connect (addr, port)
145
165
write (c, rpad (cookie, HDR_COOKIE_LEN)[1 : HDR_COOKIE_LEN])
146
166
stdout_to_master && redirect_stdout (c)
0 commit comments