diff --git a/.buildkite/commands/run-e2e-tests.sh b/.buildkite/commands/run-e2e-tests.sh index 6350727299..f9013f8885 100644 --- a/.buildkite/commands/run-e2e-tests.sh +++ b/.buildkite/commands/run-e2e-tests.sh @@ -174,12 +174,38 @@ else echo 'Installing Playwright browsers...' npx playwright install + # TEMP(AINFRA-2588 probe): Windows E2E hangs *after* the Playwright suite finishes + # (~26 min) — the runner process never exits and the job runs to the 180-min timeout. + # Snapshot the surviving process tree while it is hung to find the leaked child/handle. + # Runs in the background so it fires even though `npx playwright test` never returns. + probe_pid='' + if [ "$PLATFORM" = "windows" ]; then + ( + snapshot() { + echo "--- :mag: [hang-probe] surviving node/Studio/php processes at ${1}" + powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process | Where-Object { \$_.Name -match 'node|Studio|php|electron|Squirrel' } | Select-Object ProcessId,ParentProcessId,Name,CommandLine | Format-Table -AutoSize -Wrap | Out-String -Width 1000" || true + } + # `kill "$probe_pid"` further down only signals this subshell. A foreground `sleep` would be + # orphaned and keep running with the inherited stdout still open, so run each sleep in the + # background, `wait` on it, and kill it from a TERM trap. + sleep_pid='' + trap 'kill "$sleep_pid" 2>/dev/null || true; exit 0' TERM + sleep 2400 & sleep_pid=$!; wait "$sleep_pid" || true; snapshot '40m' + sleep 1800 & sleep_pid=$!; wait "$sleep_pid" || true; snapshot '70m' + ) & + probe_pid=$! + fi + echo 'Running Playwright tests...' # Capture the exit code so a failure doesn't trip `set -e` before we collect # the daemon logs (~/.studio/daemon/logs) for artifact upload. test_exit=0 npx playwright test || test_exit=$? + if [ -n "$probe_pid" ]; then + kill "$probe_pid" 2>/dev/null || true + fi + if [ "$test_exit" -ne 0 ] && [ -d "$HOME/.studio/daemon/logs" ]; then mkdir -p test-results/daemon-logs cp -r "$HOME/.studio/daemon/logs/." 
test-results/daemon-logs/ || true diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 10c10133c8..1ba675fcd9 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -23,9 +23,9 @@ e2e_config: &e2e_config setup: { platform: [], arch: [] } adjustments: - with: { platform: mac, arch: arm64 } - # Windows E2E temporarily disabled while AINFRA-2588 investigates Windows E2E hangs in Buildkite. - # See https://linear.app/a8c/issue/AINFRA-2588/investigate-studio-windows-e2e-hangs-in-buildkite - # - with: { platform: windows, arch: x64 } + # TEMP(AINFRA-2588): Windows E2E re-enabled here to verify the closeApp() force-kill teardown + # fix lands the job under the 180-min timeout. Revert to the disabled form before merge. + - with: { platform: windows, arch: x64 } notify: - github_commit_status: context: E2E Tests diff --git a/apps/studio/e2e/e2e-helpers.ts b/apps/studio/e2e/e2e-helpers.ts index 960a1a881d..335717a955 100644 --- a/apps/studio/e2e/e2e-helpers.ts +++ b/apps/studio/e2e/e2e-helpers.ts @@ -1,4 +1,5 @@ import { randomUUID } from 'crypto'; +import { spawnSync } from 'node:child_process'; import { tmpdir } from 'os'; import path from 'path'; import { findLatestBuild, parseElectronApp } from 'electron-playwright-helpers'; @@ -80,7 +81,16 @@ export class E2ESession { await new Promise< void >( ( resolve ) => setTimeout( resolve, 2000 ) ); console.log( 'App closed successfully' ); } catch ( error ) { - console.log( 'Process exit timeout' ); + // The graceful quit stalled — a leaky IPC handler or a `site stop --all` child that + // won't exit keeps Electron alive (notably on Windows). Force-kill the process tree so + // the Playwright runner can exit, instead of leaving an orphaned process that blocks the + // run until the CI job timeout. 
+ console.log( 'Process exit timeout; force-killing the app process tree' ); + forceKillProcessTree( childProcess ); + await Promise.race( [ + exitPromise, + new Promise< void >( ( resolve ) => setTimeout( resolve, 5_000 ) ), + ] ); } finally { this.stopCapturingMainProcessLogs(); } @@ -223,3 +233,36 @@ export class E2ESession { return this.mainProcessLogs.join( '' ).trim(); } } + +/** + * Best-effort hard kill of the app process after a graceful quit has timed out. + * + * `child.kill()` only signals the Electron main process; on Windows its renderer and php.exe + * descendants orphan and keep DLLs locked, so walk the whole tree with `taskkill /T`. Mirrors + * `killChild` in tools/common/lib/cli-process.ts. + * + * @param childProcess Handle of the process to kill; a handle without a pid is ignored. + */ +function forceKillProcessTree( childProcess: ChildProcess ): void { + const { pid } = childProcess; + if ( ! pid ) { + return; + } + + if ( process.platform === 'win32' ) { + // Bound the call: this only runs because something already hung, so a stuck `taskkill` + // must not block the runner in turn. + const result = spawnSync( 'taskkill', [ '/F', '/T', '/PID', String( pid ) ], { + windowsHide: true, + stdio: 'ignore', + timeout: 10_000, + } ); + if ( ! result.error && result.status === 0 ) { + return; + } + // `taskkill` could not be spawned, timed out or exited non-zero; fall through so the main + // process is still signalled directly. + } + + childProcess.kill( 'SIGKILL' ); +} diff --git a/apps/studio/e2e/global-teardown.ts b/apps/studio/e2e/global-teardown.ts new file mode 100644 index 0000000000..cd07456f05 --- /dev/null +++ b/apps/studio/e2e/global-teardown.ts @@ -0,0 +1,59 @@ +import { execFileSync } from 'node:child_process'; + +/** + * The E2E suite drives Studio, which spawns a long-lived CLI process-manager daemon + * (`resources/cli/process-manager-daemon.mjs`) to run each site's PHP servers. On Windows + * the daemon is orphaned when the Electron app quits and survives with its + * `php-server-child.mjs` / `php.exe` subtree; the Playwright runner then holds an open + * handle to it and never exits, so the CI job hangs to the 180-min timeout (AINFRA-2588). + * `closeApp()` / `site stop --all` don't reap it. Once the whole suite is done nothing + * needs the daemon, so kill any survivors here to let the runner exit. 
+ */ +export default function globalTeardown(): void { + if ( process.platform === 'win32' ) { + reapWindowsStudioDaemons(); + } else { + reapUnixStudioDaemons(); + } +} + +function reapWindowsStudioDaemons(): void { + const query = + 'Get-CimInstance Win32_Process | ' + + "Where-Object { $_.CommandLine -match 'process-manager-daemon\\.mjs' } | " + + 'Select-Object -ExpandProperty ProcessId'; + + let pids: string[] = []; + try { + const out = execFileSync( + 'powershell', + [ '-NoProfile', '-NonInteractive', '-Command', query ], + { encoding: 'utf8', timeout: 60_000 } + ); + pids = out.split( /\s+/ ).filter( ( token ) => /^\d+$/.test( token ) ); + } catch ( error ) { + console.warn( '[global-teardown] could not list Studio daemons; skipping reap:', error ); + return; + } + + // `taskkill /T` walks the tree, killing the daemon's php-server-child.mjs and php.exe + // descendants along with it. + for ( const pid of pids ) { + try { + execFileSync( 'taskkill', [ '/F', '/T', '/PID', pid ], { stdio: 'ignore' } ); + console.log( `[global-teardown] killed lingering Studio daemon tree (pid ${ pid })` ); + } catch { + // Already exited between the query and the kill. + } + } +} + +function reapUnixStudioDaemons(): void { + // macOS/Linux don't exhibit the hang, but reap any lingering daemon by command-line + // match so a leak can't regress silently. `pkill` exits non-zero when nothing matches. + try { + execFileSync( 'pkill', [ '-f', 'process-manager-daemon\\.mjs' ], { stdio: 'ignore' } ); + } catch { + // Nothing matched — fine. + } +} diff --git a/playwright.config.ts b/playwright.config.ts index a1ca088201..ec5c64fb1d 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -4,6 +4,10 @@ export default defineConfig( { testDir: './apps/studio/e2e', snapshotPathTemplate: '{testDir}/__screenshots__/{testFilePath}/{arg}{ext}', + // Reap Studio's orphaned CLI process-manager daemon after the suite so the runner can + // exit instead of hanging to the CI timeout on Windows (AINFRA-2588). 
+ globalTeardown: './apps/studio/e2e/global-teardown.ts', + // The app only allows a single instance to be running at a time, so we can // only run one test at a time. workers: 1,