Skip to content

Commit 898e61a

Browse files
Fix File.rename/2 race condition in lock implementation on windows (#14800)
1 parent 16fb815 commit 898e61a

File tree

1 file changed

+151
-23
lines changed

1 file changed

+151
-23
lines changed

lib/mix/lib/mix/sync/lock.ex

Lines changed: 151 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ defmodule Mix.Sync.Lock do
3232
# successfully link to lock_N. At that point we can clean up all
3333
# the files, so we perform these steps:
3434
#
35-
# * move our port_P to lock_0
36-
# * remove all the other port_P files
35+
# * replace lock_0 content with port P
36+
# * remove all port_P files
3737
# * remove all lock_1+ files
3838
#
3939
# It is important to perform these steps in this order, to avoid
@@ -44,14 +44,14 @@ defmodule Mix.Sync.Lock do
4444
# the port_P file will no longer exist (once lock_N is removed).
4545
#
4646
# Finally, note that we do not remove the lock file in `unlock/1`.
47-
# If we did that, another process could try to connect and fail
48-
# because the file would not exist, in such case the process would
49-
# assume the file is stale and needs to be replaced, therefore
50-
# possibly replacing another process who successfully links at the
51-
# empty spot. This means we effectively always leave a stale file,
52-
# however, in order to shortcut the port check for future processes,
53-
# we atomically replace the file content with port 0, to indicate
54-
# the file is stale.
47+
# If we did that, another process could read it before removal,
48+
# then it may connect and fail (once the socket is closed), in such
49+
# case the process would assume the file is stale and needs to be
50+
# replaced, therefore possibly replacing another process who
51+
# successfully links at the empty spot. This means we effectively
52+
# always leave a stale file, however, in order to shortcut the port
53+
# check for future processes, we atomically replace the file content
54+
# with port 0, to indicate the file is stale.
5555
#
5656
# The main caveat of using ephemeral TCP ports is that they are not
5757
# unique. This creates a theoretical scenario where the lock holder
@@ -72,6 +72,7 @@ defmodule Mix.Sync.Lock do
7272
@probe_data "mixlock"
7373
@probe_data_size byte_size(@probe_data)
7474
@probe_timeout_ms 5_000
75+
@version 2
7576

7677
@typedoc """
7778
Options for `with_lock/3`.
@@ -132,7 +133,11 @@ defmodule Mix.Sync.Lock do
132133
defp base_path do
133134
# We include user in the dir to avoid permission conflicts across users
134135
user = System.get_env("USER", "default")
135-
Path.join(System.tmp_dir!(), "mix_lock_#{Base.url_encode64(user, padding: false)}")
136+
137+
Path.join(
138+
System.tmp_dir!(),
139+
"mix_lock_#{@version}_#{Base.url_encode64(user, padding: false)}"
140+
)
136141
end
137142

138143
defp lock_disabled?(), do: System.get_env("MIX_OS_CONCURRENCY_LOCK") in ~w(0 false)
@@ -177,7 +182,7 @@ defmodule Mix.Sync.Lock do
177182
port_path = Path.join(path, "port_#{port}")
178183
os_pid = System.pid()
179184

180-
File.write!(port_path, <<port::unsigned-integer-32, os_pid::binary>>, [:raw])
185+
switch_file_create!(port_path, encode_lock_info(port, os_pid))
181186

182187
case grab_lock(path, port_path, 0) do
183188
{:ok, 0} ->
@@ -186,7 +191,7 @@ defmodule Mix.Sync.Lock do
186191

187192
{:ok, _n} ->
188193
# We grabbed lock_1+, so we need to replace lock_0 and clean up
189-
take_over(path, port_path)
194+
take_over(path, port, os_pid)
190195
%{socket: socket, path: path}
191196

192197
{:taken, probe_socket, os_pid} ->
@@ -253,10 +258,15 @@ defmodule Mix.Sync.Lock do
253258
end
254259

255260
defp fetch_probe_port(port_path) do
256-
case File.read(port_path) do
257-
{:ok, <<0::unsigned-integer-32>>} -> {:error, :ignore}
258-
{:ok, <<port::unsigned-integer-32, os_pid::binary>>} -> {:ok, port, os_pid}
259-
{:error, reason} -> {:error, reason}
261+
case switch_file_read(port_path) do
262+
{:ok, data} ->
263+
case decode_lock_info(data) do
264+
{0, _os_pid} -> {:error, :ignore}
265+
{port, os_pid} -> {:ok, port, os_pid}
266+
end
267+
268+
{:error, reason} ->
269+
{:error, reason}
260270
end
261271
end
262272

@@ -305,15 +315,14 @@ defmodule Mix.Sync.Lock do
305315
end
306316
end
307317

308-
defp take_over(path, port_path) do
318+
defp take_over(path, port, os_pid) do
309319
# The operations here must happen in precise order, so if anything
310320
# fails, we keep the files as is and the next process that grabs
311321
# the lock will do the cleanup
312322

313323
lock_path = Path.join(path, "lock_0")
314324

315-
# We linked to lock_N successfully, so port_path should exist
316-
File.rename!(port_path, lock_path)
325+
switch_file_replace!(lock_path, encode_lock_info(port, os_pid))
317326

318327
names = File.ls!(path)
319328

@@ -342,14 +351,133 @@ defmodule Mix.Sync.Lock do
342351
end
343352

344353
defp unlock(lock) do
345-
port_path = Path.join(lock.path, "port_0")
346354
lock_path = Path.join(lock.path, "lock_0")
347355

348-
File.write!(port_path, <<0::unsigned-integer-32>>, [:raw])
349-
File.rename!(port_path, lock_path)
356+
switch_file_replace!(lock_path, encode_lock_info(0, ""))
350357
after
351358
# Closing the socket will cause the accepting process to finish
352359
# and all accepted sockets (tied to that process) will get closed
353360
:gen_tcp.close(lock.socket)
354361
end
362+
363+
defp encode_lock_info(port, os_pid) do
364+
os_pid_size = byte_size(os_pid)
365+
366+
if os_pid_size > 32 do
367+
Mix.raise("unexpectedly long PID: #{inspect(os_pid)}")
368+
end
369+
370+
# The info needs to have fixed size, so we pad os_pid to maximum
371+
# of 32 bytes (we expect it to be a few bytes).
372+
padding_size = 32 - os_pid_size
373+
padding = :binary.copy(<<0>>, padding_size)
374+
375+
<<
376+
port::unsigned-integer-32,
377+
padding_size::unsigned-integer-8,
378+
padding::binary,
379+
os_pid::binary
380+
>>
381+
end
382+
383+
defp decode_lock_info(data) do
384+
<<
385+
port::unsigned-integer-32,
386+
padding_size::unsigned-integer-8,
387+
_padding::binary-size(padding_size),
388+
os_pid::binary
389+
>> = data
390+
391+
{port, os_pid}
392+
end
393+
394+
# We need a mechanism to atomically replace file content. Typically,
395+
# we could use File.rename/2 to do that, however File.rename/2 is
396+
# not atomic on Windows, if the destination exists [1].
397+
#
398+
# As an alternative approach we use a switch-file. The file content
399+
# consists of 1 switch byte (either 0 or 1) and two content segments
400+
# with fixed, equal lengths. The switch byte indicates which segment
401+
# is currently active. To replace the file content, we write to the
402+
# non-active segment and call :file.sync/1 to ensure the segment is
403+
# persisted, then we toggle the switch byte. While we cannot write
404+
# multiple bytes atomically (since they may reside in multiple disk
405+
# sectors), if we toggle only a single byte, there is no intermediate
406+
# invalid state, which gives us the atomic replace we need.
407+
#
408+
# Note that file content can be replaced only by a single process
409+
# at a time.
410+
#
411+
# [1]: https://github.com/elixir-lang/elixir/pull/14793#issuecomment-3338665065
412+
defp switch_file_create!(path, content) do
413+
data = <<0, content::binary, content::binary>>
414+
File.write!(path, data, [:raw])
415+
end
416+
417+
defp switch_file_replace!(path, new_content) do
418+
file = File.open!(path, [:read, :write, :binary, :raw])
419+
420+
content_size = byte_size(new_content)
421+
422+
<<switch_byte>> = read_bytes!(file, 1)
423+
424+
try do
425+
inactive_content_position =
426+
case switch_byte do
427+
0 -> 1 + content_size
428+
1 -> 1
429+
end
430+
431+
# Write new data
432+
file_pwrite_sync!(file, inactive_content_position, new_content)
433+
434+
# Toggle switch byte - it's a single byte so the content changes
435+
# atomically
436+
file_pwrite_sync!(file, 0, <<1 - switch_byte>>)
437+
after
438+
File.close(file)
439+
end
440+
end
441+
442+
defp switch_file_read(path) do
443+
with {:ok, data} <- File.read(path) do
444+
<<switch_byte, rest::binary>> = data
445+
content_size = rest |> byte_size() |> div(2)
446+
<<content1::binary-size(^content_size), content2::binary-size(^content_size)>> = rest
447+
448+
case switch_byte do
449+
0 -> {:ok, content1}
450+
1 -> {:ok, content2}
451+
end
452+
end
453+
end
454+
455+
defp read_bytes!(file, bytes) do
456+
case :file.read(file, bytes) do
457+
{:ok, data} ->
458+
data
459+
460+
:eof ->
461+
raise "unexpected EOF of file when reading file"
462+
463+
{:error, reason} ->
464+
raise File.Error, reason: reason, action: "read file"
465+
end
466+
end
467+
468+
defp file_pwrite_sync!(file, position, bytes) do
469+
case :file.pwrite(file, position, bytes) do
470+
:ok ->
471+
case :file.sync(file) do
472+
:ok ->
473+
:ok
474+
475+
{:error, reason} ->
476+
raise File.Error, reason: reason, action: "sync file"
477+
end
478+
479+
{:error, {_n, reason}} ->
480+
raise File.Error, reason: reason, action: "write to file at position"
481+
end
482+
end
355483
end

0 commit comments

Comments
 (0)