Skip to content

Commit 7949cff

Browse files
authored
fix(libnpmexec): improve withLock stability (#8577)
Various improvements to withLock and its tests - fix windows race conditions/errors - use second-level granularity when detecting lock compromise; this resolves a sporadic floating point issue under APFS, and makes this generally more robust no matter the underlying file system - improve touchLock logic so that it doesn't compromise the lock for the next holder or keep running the interval when cleanup fails ## Testing notes Fixes were verified via a [modified GHA workflow](https://github.com/jenseng/cli/actions/runs/17803264354) that ran all the tests 100 times 😅 ## References Related to #8512
1 parent d389614 commit 7949cff

File tree

2 files changed

+122
-21
lines changed

2 files changed

+122
-21
lines changed

workspaces/libnpmexec/lib/with-lock.js

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ const { onExit } = require('signal-exit')
1818
// - more ergonomic compromised lock handling (i.e. withLock will reject, and callbacks have access to an AbortSignal)
1919
// - uses a more recent version of signal-exit
2020

21-
const touchInterval = 100
21+
const touchInterval = 1_000
2222
// mtime precision is platform dependent, so use a reasonably large threshold
2323
const staleThreshold = 5_000
2424

@@ -75,7 +75,7 @@ function acquireLock (lockPath) {
7575
try {
7676
await fs.mkdir(lockPath)
7777
} catch (err) {
78-
if (err.code !== 'EEXIST') {
78+
if (err.code !== 'EEXIST' && err.code !== 'EBUSY' && err.code !== 'EPERM') {
7979
throw err
8080
}
8181

@@ -86,9 +86,18 @@ function acquireLock (lockPath) {
8686
return retry(err)
8787
}
8888
if (status === 'stale') {
89-
// there is a very tiny window where another process could also release the stale lock and acquire it before we release it here; the lock compromise checker should detect this and throw an error
90-
deleteLock(lockPath, ['ENOENT', 'EBUSY']) // on windows, EBUSY can happen if another process is creating the lock; we'll just retry
89+
try {
90+
// there is a very tiny window where another process could also release the stale lock and acquire it before we release it here; the lock compromise checker should detect this and throw an error
91+
deleteLock(lockPath)
92+
} catch (e) {
93+
// on windows, EBUSY/EPERM can happen if another process is (re)creating the lock; maybe we can acquire it on a subsequent attempt 🤞
94+
if (e.code === 'EBUSY' || e.code === 'EPERM') {
95+
return retry(e)
96+
}
97+
throw e
98+
}
9199
}
100+
// immediately attempt to acquire the lock (no backoff)
92101
return await acquireLock(lockPath)
93102
}
94103
try {
@@ -100,12 +109,12 @@ function acquireLock (lockPath) {
100109
})
101110
}
102111

103-
function deleteLock (lockPath, ignoreCodes = ['ENOENT']) {
112+
function deleteLock (lockPath) {
104113
try {
105114
// synchronous, so we can call in an exit handler
106115
rmdirSync(lockPath)
107116
} catch (err) {
108-
if (!ignoreCodes.includes(err.code)) {
117+
if (err.code !== 'ENOENT') {
109118
throw err
110119
}
111120
}
@@ -131,31 +140,33 @@ async function getLockStatus (lockPath) {
131140
async function maintainLock (lockPath) {
132141
const controller = new AbortController()
133142
const stats = await fs.stat(lockPath)
134-
let mtimeMs = stats.mtimeMs
143+
// fs.utimes operates on floating points seconds (directly, or via strings/Date objects), which may not match the underlying filesystem's mtime precision, meaning that we might read a slightly different mtime than we write. always round to the nearest second, since all filesystems support at least second precision
144+
let mtime = Math.round(stats.mtimeMs / 1000)
135145
const signal = controller.signal
136146

137147
async function touchLock () {
138148
try {
139149
const currentStats = (await fs.stat(lockPath))
140-
if (currentStats.ino !== stats.ino || currentStats.mtimeMs !== mtimeMs) {
150+
const currentMtime = Math.round(currentStats.mtimeMs / 1000)
151+
if (currentStats.ino !== stats.ino || currentMtime !== mtime) {
141152
throw new Error('Lock compromised')
142153
}
143-
mtimeMs = Date.now()
144-
const mtime = new Date(mtimeMs)
145-
await fs.utimes(lockPath, mtime, mtime)
146-
} catch (err) {
147-
// stats mismatch or other fs error means the lock was compromised, unless we just released the lock during this iteration
154+
mtime = Math.round(Date.now() / 1000)
155+
// touch the lock, unless we just released it during this iteration
148156
if (currentLocks.has(lockPath)) {
149-
controller.abort()
157+
await fs.utimes(lockPath, mtime, mtime)
150158
}
159+
} catch (err) {
160+
// stats mismatch or other fs error means the lock was compromised
161+
controller.abort()
151162
}
152163
}
153164

154165
const timeout = setInterval(touchLock, touchInterval)
155166
timeout.unref()
156167
function cleanup () {
157-
deleteLock(lockPath)
158168
clearInterval(timeout)
169+
deleteLock(lockPath)
159170
}
160171
currentLocks.set(lockPath, cleanup)
161172
return signal

workspaces/libnpmexec/test/with-lock.js

Lines changed: 96 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ t.test('stale lock takeover', async (t) => {
102102
}
103103
}
104104
let statCalls = 0
105-
const mtimeMs = Date.now()
105+
const mtimeMs = Math.round(Date.now() / 1000) * 1000
106106
mockStat = async () => {
107107
if (++statCalls === 1) {
108108
return { mtimeMs: mtimeMs - 10_000 }
@@ -122,6 +122,48 @@ t.test('stale lock takeover', async (t) => {
122122
t.equal(mkdirCalls, 2, 'should make two mkdir calls')
123123
})
124124

125+
t.test('EBUSY during lock acquisition', async (t) => {
126+
let mkdirCalls = 0
127+
mockMkdir = async (...args) => {
128+
if (++mkdirCalls === 1) {
129+
throw Object.assign(new Error(), { code: 'EBUSY' })
130+
}
131+
return fs.promises.mkdir(...args)
132+
}
133+
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
134+
t.ok(await withLock(lockPath, async () => true))
135+
t.equal(mkdirCalls, 2, 'should make two mkdir calls')
136+
})
137+
138+
t.test('EBUSY during stale lock takeover', async (t) => {
139+
let mkdirCalls = 0
140+
mockMkdir = async () => {
141+
if (++mkdirCalls === 1) {
142+
throw Object.assign(new Error(), { code: 'EEXIST' })
143+
}
144+
}
145+
let statCalls = 0
146+
const mtimeMs = Math.round(Date.now() / 1000) * 1000
147+
mockStat = async () => {
148+
if (++statCalls === 1) {
149+
return { mtimeMs: mtimeMs - 10_000 }
150+
} else {
151+
return { mtimeMs, ino: 1 }
152+
}
153+
}
154+
let rmdirSyncCalls = 0
155+
mockRmdirSync = () => {
156+
if (++rmdirSyncCalls === 1) {
157+
throw Object.assign(new Error(), { code: 'EBUSY' })
158+
}
159+
}
160+
161+
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
162+
const lockPromise = withLock(lockPath, async () => 'test value')
163+
t.equal(await lockPromise, 'test value', 'should take over the lock')
164+
t.equal(mkdirCalls, 2, 'should make two mkdir calls')
165+
})
166+
125167
t.test('concurrent stale lock takeover', async (t) => {
126168
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
127169
// make a stale lock
@@ -147,7 +189,7 @@ t.test('mkdir -> getLockStatus race', async (t) => {
147189
}
148190
}
149191
let statCalls = 0
150-
const mtimeMs = Date.now()
192+
const mtimeMs = Math.round(Date.now() / 1000) * 1000
151193
mockStat = async () => {
152194
if (++statCalls === 1) {
153195
throw Object.assign(new Error(), { code: 'ENOENT' })
@@ -167,6 +209,22 @@ t.test('mkdir -> getLockStatus race', async (t) => {
167209
t.equal(mkdirCalls, 2, 'should make two mkdir calls')
168210
})
169211

212+
t.test('mtime floating point mismatch', async (t) => {
213+
let mtimeMs = Math.round(Date.now() / 1000) * 1000
214+
mockStat = async () => {
215+
return { mtimeMs, ino: 1 }
216+
}
217+
mockUtimes = async (_, nextMtimeSeconds) => {
218+
mtimeMs = nextMtimeSeconds * 1000 - 0.001
219+
}
220+
221+
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
222+
t.ok(await withLock(lockPath, async () => {
223+
await setTimeout(2000)
224+
return true
225+
}), 'should handle mtime floating point mismatches')
226+
})
227+
170228
t.test('unexpected errors', async (t) => {
171229
t.test('can\'t create lock', async (t) => {
172230
const lockPath = '/these/parent/directories/do/not/exist/so/it/should/fail.lock'
@@ -217,7 +275,7 @@ t.test('unexpected errors', async (t) => {
217275
mockStat = async () => {
218276
return { mtimeMs: Date.now(), ino: Math.floor(Math.random() * 1000000) }
219277
}
220-
await t.rejects(withLock(lockPath, () => setTimeout(1000)), { code: 'ECOMPROMISED' })
278+
await t.rejects(withLock(lockPath, () => setTimeout(2000)), { code: 'ECOMPROMISED' })
221279
})
222280

223281
t.test('lock compromised (deleted)', async (t) => {
@@ -226,10 +284,42 @@ t.test('unexpected errors', async (t) => {
226284
mockStat = async () => {
227285
throw Object.assign(new Error(), { code: 'ENOENT' })
228286
}
229-
await t.rejects(withLock(lockPath, () => setTimeout(1000)), { code: 'ECOMPROMISED' })
287+
await t.rejects(withLock(lockPath, () => setTimeout(2000)), { code: 'ECOMPROMISED' })
230288
})
231289
})
232290

291+
t.test('lock released during maintenance', async (t) => {
292+
// this test validates that if we release the lock while touchLock is running, it doesn't interfere with subsequent locks
293+
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
294+
295+
let releaseLock
296+
const releaseLockPromise = new Promise((resolve) => {
297+
releaseLock = resolve
298+
})
299+
300+
let statCalls = 0
301+
mockStat = async (...args) => {
302+
const value = await fs.promises.stat(...args)
303+
if (++statCalls > 1) {
304+
// this runs during the setInterval; release the lock so that we no longer hold it
305+
await releaseLock('test value')
306+
await setTimeout()
307+
}
308+
return value
309+
}
310+
311+
let utimesCalls = 0
312+
mockUtimes = async () => {
313+
utimesCalls++
314+
}
315+
316+
const lockPromise = withLock(lockPath, () => releaseLockPromise)
317+
// since we unref the interval timeout, we need to wait to ensure it actually runs
318+
await setTimeout(2000)
319+
t.equal(await lockPromise, 'test value', 'should acquire the lock')
320+
t.equal(utimesCalls, 0, 'should never call utimes')
321+
})
322+
233323
t.test('onExit handler', async (t) => {
234324
t.ok(onExitHandler, 'should be registered')
235325
let rmdirSyncCalls = 0
@@ -241,8 +331,8 @@ t.test('onExit handler', async (t) => {
241331
const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock')
242332
// don't await it since the promise never resolves
243333
withLock(lockPath, () => new Promise(() => {})).catch(() => {})
244-
// ensure the lock is acquired
245-
await setTimeout(0)
334+
// since we unref the interval timeout, we need to wait to ensure it actually runs
335+
await setTimeout(2000)
246336
onExitHandler()
247337
t.ok(rmdirSyncCalls > 0, 'should have removed outstanding locks')
248338
})

0 commit comments

Comments
 (0)