diff --git a/boltdb-sanity-check.sh b/boltdb-sanity-check.sh
new file mode 100644
index 0000000..cca6c37
--- /dev/null
+++ b/boltdb-sanity-check.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# BoltDB sanity check script to run before committing sticky disk.
+#
+# Exits 1 if any existing database file fails a check or if lock/temporary
+# files are found in $BUILDKIT_DIR; exits 0 otherwise.
+
+BUILDKIT_DIR="/var/lib/buildkit"
+CORRUPTED=0
+
+# Byte offset of the BoltDB magic number: the first page begins with a 16-byte
+# page header (id, flags, count, overflow) and the meta magic follows it.
+BOLT_MAGIC_OFFSET=16
+
+# Check whether a single BoltDB file looks valid.
+#   $1 - path to the database file
+# Returns 0 when the file is missing (warning only) or passes every check,
+# 1 when it is empty, unreadable, or does not carry the BoltDB magic number.
+check_boltdb() {
+    local db_file="$1"
+    local size magic
+
+    # Probe through sudo, like the readability check below, so the result does
+    # not depend on the invoking user's access to the buildkit directory.
+    if ! sudo test -f "$db_file"; then
+        echo "WARN: $db_file does not exist"
+        return 0
+    fi
+
+    # Check file size - 0 byte files are definitely corrupted
+    size=$(sudo stat -c%s "$db_file" 2>/dev/null || sudo stat -f%z "$db_file" 2>/dev/null)
+    if [ -z "$size" ]; then
+        echo "ERROR: could not determine size of $db_file"
+        return 1
+    fi
+    if [ "$size" -eq 0 ]; then
+        echo "ERROR: $db_file is 0 bytes - corrupted"
+        return 1
+    fi
+
+    # Check if file is readable before trying to parse its header
+    if ! sudo head -c 4096 "$db_file" >/dev/null 2>&1; then
+        echo "ERROR: $db_file is not readable"
+        return 1
+    fi
+
+    # Check BoltDB magic number (0xED0CDAED, stored in host byte order; the
+    # second accepted value is the same number read with swapped endianness).
+    # -A n drops od's offset column so the output is just the 4-byte word.
+    magic=$(sudo od -A n -j "$BOLT_MAGIC_OFFSET" -N 4 -t x4 "$db_file" 2>/dev/null | tr -d '[:space:]')
+    if [ "$magic" != "ed0cdaed" ] && [ "$magic" != "edda0ced" ]; then
+        echo "ERROR: $db_file has invalid BoltDB magic header: $magic"
+        return 1
+    fi
+
+    echo "OK: $db_file appears valid (size: $size bytes)"
+    return 0
+}
+
+echo "Checking BoltDB files in $BUILDKIT_DIR..."
+
+for db in history.db cache.db snapshots.db metadata_v2.db containerdmeta.db; do
+    db_path="$BUILDKIT_DIR/$db"
+    if ! check_boltdb "$db_path"; then
+        CORRUPTED=1
+    fi
+done
+
+# Also check for any lock files that might indicate unclean shutdown
+lock_files=$(sudo find "$BUILDKIT_DIR" -maxdepth 1 -name '*.lock' 2>/dev/null)
+if [ -n "$lock_files" ]; then
+    echo "$lock_files"
+    echo "WARN: Lock files found - buildkit may not have shutdown cleanly"
+    CORRUPTED=1
+fi
+
+# Check for temp/new files that indicate incomplete operations
+temp_files=$(sudo find "$BUILDKIT_DIR" -maxdepth 1 \( -name '*-wal' -o -name '*-shm' -o -name 'new-*' \) 2>/dev/null)
+if [ -n "$temp_files" ]; then
+    echo "$temp_files"
+    echo "WARN: Temporary files found - buildkit may have incomplete operations"
+    CORRUPTED=1
+fi
+
+if [ "$CORRUPTED" -eq 1 ]; then
+    echo "CRITICAL: BoltDB corruption detected - DO NOT COMMIT STICKY DISK"
+    exit 1
+else
+    echo "All BoltDB files appear healthy"
+    exit 0
+fi
\ No newline at end of file
diff --git a/buildkit-corruption-fixes.md b/buildkit-corruption-fixes.md new file mode 100644 index 0000000..503ad99 --- /dev/null +++ b/buildkit-corruption-fixes.md @@ -0,0 +1,177 @@ +# Buildkit Corruption Prevention Measures + +## Immediate Fixes + +### 1. Enhanced Buildkitd Shutdown Sequence +```typescript +// In shutdownBuildkitd() function +export async function shutdownBuildkitd(): Promise { + const startTime = Date.now(); + const timeout = 30000; // Increase to 30 seconds + const backoff = 300; + + try { + // First, try graceful shutdown with SIGTERM + await execAsync(`sudo pkill -TERM buildkitd`); + + // Wait for graceful shutdown + let gracefulShutdown = false; + while (Date.now() - startTime < 10000) { // 10s for graceful + try { + await execAsync('pgrep buildkitd'); + await new Promise(resolve => setTimeout(resolve, backoff)); + } catch (error) { + if (error.code === 1) { + gracefulShutdown = true; + break; + } + } + } + + // If still running, force kill + if (!gracefulShutdown) { + core.warning('Buildkitd did not shutdown gracefully, forcing termination'); + await execAsync(`sudo pkill -KILL buildkitd`); + } + + // Critical: Sync filesystem after buildkitd exits + await execAsync('sync'); + + // Double-check all buildkit processes are gone + await new 
Promise(resolve => setTimeout(resolve, 1000)); + try { + await execAsync('pgrep -f buildkit'); + throw new Error('Buildkit processes still running after shutdown'); + } catch (error) { + if (error.code !== 1) throw error; + } + + } catch (error) { + core.error('error shutting down buildkitd process:', error); + throw error; + } +} +``` + +### 2. Add Pre-Commit Validation +```typescript +// Add before committing sticky disk +async function validateBuildkitState(): Promise { + // Check no buildkit processes + try { + await execAsync('pgrep -f buildkit'); + throw new Error('Buildkit processes still running'); + } catch (error) { + if (error.code !== 1) throw error; + } + + // Verify database files are readable + const dbFiles = [ + '/var/lib/buildkit/history.db', + '/var/lib/buildkit/cache.db', + '/var/lib/buildkit/snapshots.db' + ]; + + for (const dbFile of dbFiles) { + try { + await execAsync(`sudo file ${dbFile}`); + } catch (error) { + core.warning(`Database file ${dbFile} may be corrupted`); + } + } + + // Final sync + await execAsync('sync'); + await new Promise(resolve => setTimeout(resolve, 2000)); +} +``` + +### 3. Add Buildkit Health Checks During Build +```typescript +// Periodically check buildkit health during long builds +async function monitorBuildkitHealth(): Promise { + const interval = setInterval(async () => { + try { + const {stdout} = await execAsync(`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} debug workers`); + if (!stdout || stdout.trim().split('\n').length < 2) { + core.warning('Buildkit workers unhealthy during build'); + } + } catch (error) { + core.warning(`Buildkit health check failed: ${error.message}`); + } + }, 30000); // Every 30 seconds + + return () => clearInterval(interval); +} +``` + +## Debug Information Collection + +### 1. 
Capture Buildkit State Before Shutdown +```typescript +async function captureBuildkitDebugInfo(): Promise { + try { + // Capture worker state + await execAsync(`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} debug workers > /tmp/buildkit-workers.log`); + + // Capture cache info + await execAsync(`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du > /tmp/buildkit-du.log`); + + // Capture process info + await execAsync('ps aux | grep buildkit > /tmp/buildkit-processes.log'); + + // Capture filesystem state + await execAsync(`ls -la /var/lib/buildkit/ > /tmp/buildkit-files.log`); + await execAsync(`df -h /var/lib/buildkit > /tmp/buildkit-df.log`); + + // Capture database file info + await execAsync('sudo file /var/lib/buildkit/*.db >> /tmp/buildkit-files.log'); + + } catch (error) { + core.debug(`Error capturing debug info: ${error.message}`); + } +} +``` + +### 2. Add Database Corruption Detection +```typescript +async function checkDatabaseIntegrity(): Promise { + const dbFiles = [ + '/var/lib/buildkit/history.db', + '/var/lib/buildkit/cache.db', + '/var/lib/buildkit/snapshots.db' + ]; + + for (const dbFile of dbFiles) { + try { + // Use bbolt tool if available to check integrity + await execAsync(`sudo bbolt check ${dbFile}`); + } catch (error) { + core.warning(`Database ${dbFile} integrity check failed: ${error.message}`); + + // Try to backup corrupted file + await execAsync(`sudo cp ${dbFile} ${dbFile}.corrupted-$(date +%s)`); + return false; + } + } + return true; +} +``` + +## Long-term Solutions + +1. **Consider using buildkit's built-in persistence**: + - Use `--oci-worker-snapshotter=native` instead of overlayfs + - This may be more robust with block storage + +2. **Implement proper database closure**: + - Send specific shutdown signal that triggers proper BoltDB closure + - Consider patching buildkit to add pre-shutdown hook + +3. 
**Add retry mechanism for corrupted state**: + - Detect corruption on startup + - Automatically clean and retry with fresh state + +4. **Monitor and alert on corruption patterns**: + - Track frequency of different corruption types + - Alert when corruption rate exceeds threshold \ No newline at end of file diff --git a/fix-buildkit-corruption-minimal.patch b/fix-buildkit-corruption-minimal.patch new file mode 100644 index 0000000..b277dd0 --- /dev/null +++ b/fix-buildkit-corruption-minimal.patch @@ -0,0 +1,204 @@ +diff --git a/src/main.ts b/src/main.ts +index abc123..def456 100644 +--- a/src/main.ts ++++ b/src/main.ts +@@ -615,16 +615,20 @@ export async function shutdownBuildkitd(): Promise { + const startTime = Date.now(); + const timeout = 10000; // 10 seconds + const backoff = 300; // 300ms ++ let gracefulShutdown = false; + + try { + await execAsync(`sudo pkill -TERM buildkitd`); + + // Wait for buildkitd to shutdown with backoff retry + while (Date.now() - startTime < timeout) { + try { + const {stdout} = await execAsync('pgrep buildkitd'); + core.debug(`buildkitd process still running with PID: ${stdout.trim()}`); + await new Promise(resolve => setTimeout(resolve, backoff)); + } catch (error) { + if (error.code === 1) { + // pgrep returns exit code 1 when no process is found, which means shutdown successful ++ gracefulShutdown = true; + core.debug('buildkitd successfully shutdown'); + return; + } +@@ -633,7 +637,17 @@ export async function shutdownBuildkitd(): Promise { + } + } + +- throw new Error('Timed out waiting for buildkitd to shutdown after 10 seconds'); ++ if (!gracefulShutdown) { ++ // CRITICAL: Do not continue if buildkitd didn't shutdown cleanly ++ // This prevents committing a potentially corrupted device ++ throw new Error('buildkitd failed to shutdown gracefully within timeout - device may be corrupted'); ++ } ++ ++ // CRITICAL: Sync after buildkitd exits to flush all database writes ++ await execAsync('sync'); ++ // Give kernel time to 
complete the sync ++ await new Promise(resolve => setTimeout(resolve, 500)); ++ + } catch (error) { + core.error('error shutting down buildkitd process:', error); + throw error; +@@ -413,9 +427,15 @@ actionsToolkit.run( + } + + const buildkitdShutdownStartTime = Date.now(); +- await shutdownBuildkitd(); +- const buildkitdShutdownDurationMs = Date.now() - buildkitdShutdownStartTime; +- await reporter.reportMetric(Metric_MetricType.BPA_BUILDKITD_SHUTDOWN_DURATION_MS, buildkitdShutdownDurationMs); +- core.info('Shutdown buildkitd'); ++ try { ++ await shutdownBuildkitd(); ++ const buildkitdShutdownDurationMs = Date.now() - buildkitdShutdownStartTime; ++ await reporter.reportMetric(Metric_MetricType.BPA_BUILDKITD_SHUTDOWN_DURATION_MS, buildkitdShutdownDurationMs); ++ core.info('Shutdown buildkitd gracefully'); ++ } catch (shutdownError) { ++ // If buildkitd didn't shutdown gracefully, we should NOT commit the sticky disk ++ core.error(`Buildkitd shutdown failed: ${shutdownError.message}`); ++ throw new Error('Cannot commit sticky disk - buildkitd did not shutdown cleanly'); ++ } + } else { + core.debug('No buildkitd process found running'); + } +@@ -431,8 +451,11 @@ actionsToolkit.run( + + await leaveTailnet(); + try { +- // Run sync to flush any pending writes before unmounting. 
++ // Multiple syncs to ensure all writes are flushed before unmounting ++ await execAsync('sync'); ++ await new Promise(resolve => setTimeout(resolve, 200)); + await execAsync('sync'); ++ + const {stdout: mountOutput} = await execAsync(`mount | grep ${mountPoint}`); + if (mountOutput) { + for (let attempt = 1; attempt <= 3; attempt++) { +@@ -462,10 +485,16 @@ actionsToolkit.run( + + if (builderInfo.addr) { + if (!buildError) { +- await reporter.reportBuildCompleted(exportRes, builderInfo.buildId, ref, buildDurationSeconds, builderInfo.exposeId); ++ try { ++ await reporter.reportBuildCompleted(exportRes, builderInfo.buildId, ref, buildDurationSeconds, builderInfo.exposeId); ++ } catch (commitError) { ++ core.error(`Failed to commit sticky disk: ${commitError.message}`); ++ throw commitError; ++ } + } else { +- await reporter.reportBuildFailed(builderInfo.buildId, buildDurationSeconds, builderInfo.exposeId); ++ // Don't commit the sticky disk if the build failed ++ core.warning('Build failed - not committing sticky disk to prevent corruption'); + } + } + } catch (error) { + core.warning(`Error during Blacksmith builder shutdown: ${error.message}`); +@@ -511,7 +540,11 @@ actionsToolkit.run( + core.warning(`Error pruning BuildKit cache: ${error.message}`); + } + +- await shutdownBuildkitd(); ++ try { ++ await shutdownBuildkitd(); ++ } catch (shutdownError) { ++ core.error(`Critical: buildkitd did not shutdown cleanly in post: ${shutdownError.message}`); ++ } + core.info('Shutdown buildkitd'); + } + } catch (error) { +@@ -523,8 +556,10 @@ actionsToolkit.run( + } + + try { +- // Run sync to flush any pending writes before unmounting. 
++ // Multiple syncs before final unmount ++ await execAsync('sync'); ++ await new Promise(resolve => setTimeout(resolve, 200)); + await execAsync('sync'); ++ + const {stdout: mountOutput} = await execAsync(`mount | grep ${mountPoint}`); + if (mountOutput) { + for (let attempt = 1; attempt <= 3; attempt++) { +diff --git a/src/reporter.ts b/src/reporter.ts +index abc123..def456 100644 +--- a/src/reporter.ts ++++ b/src/reporter.ts +@@ -63,6 +63,11 @@ export async function reportBuildCompleted(exportRes?: ExportRecordResponse, bla + return; + } + ++ // Add a final sync before committing to ensure all writes are persisted ++ try { ++ await execAsync('sync'); ++ } catch (e) {} ++ + try { + const agentClient = createBlacksmithAgentClient(); + +@@ -114,6 +119,11 @@ export async function reportBuildFailed(dockerBuildId: string | null, dockerBuil + return; + } + ++ // For failed builds, we should be extra careful about committing ++ // Add shouldCommit: false to prevent committing corrupted state ++ const shouldCommit = false; ++ core.warning('Build failed - not committing sticky disk to prevent potential corruption'); ++ + try { + const blacksmithAgentClient = createBlacksmithAgentClient(); + +@@ -121,7 +131,7 @@ export async function reportBuildFailed(dockerBuildId: string | null, dockerBuil + exposeId: exposeId || '', + stickyDiskKey: process.env.GITHUB_REPO_NAME || '', + vmId: process.env.BLACKSMITH_VM_ID || '', +- shouldCommit: true, ++ shouldCommit: shouldCommit, + repoName: process.env.GITHUB_REPO_NAME || '', + stickyDiskToken: process.env.BLACKSMITH_STICKYDISK_TOKEN || '' + }); +diff --git a/src/setup_builder.ts b/src/setup_builder.ts +index abc123..def456 100644 +--- a/src/setup_builder.ts ++++ b/src/setup_builder.ts +@@ -84,11 +84,13 @@ async function writeBuildkitdTomlFile(parallelism: number, addr: string): Promis + oci: { + enabled: true, + // Disable automatic garbage collection, since we will prune manually. 
Automatic GC + // has been seen to negatively affect startup times of the daemon. + gc: false, + 'max-parallelism': parallelism, +- snapshotter: 'overlayfs' ++ snapshotter: 'overlayfs', ++ // Add explicit sync settings for better data integrity ++ 'sync-target': 'disk' + }, + containerd: { + enabled: false + } +@@ -354,6 +356,25 @@ export async function setupStickyDisk(dockerfilePath: string, setupOnly: boolean + buildId = buildResponse?.docker_build_id; + } + await execAsync(`sudo mkdir -p ${mountPoint}`); ++ ++ // Check if there are any corrupted database files from previous runs ++ const dbFiles = ['history.db', 'cache.db', 'snapshots.db', 'metadata_v2.db', 'containerdmeta.db']; ++ for (const db of dbFiles) { ++ const dbPath = `${device}/${db}`; ++ try { ++ // If we can detect corruption, remove the file before mounting ++ const {stdout} = await execAsync(`sudo debugfs -R "stat ${db}" ${device} 2>&1 | grep -E "Size:|Inode:"`); ++ if (stdout && stdout.includes('Size: 0')) { ++ core.warning(`Detected potentially corrupted ${db}, will be recreated`); ++ // Note: We can't easily delete from unmounted ext4, buildkit will recreate on start ++ } ++ } catch (e) { ++ // debugfs might not be available or file might not exist, which is fine ++ } ++ } ++ + await execAsync(`sudo mount ${device} ${mountPoint}`); + core.debug(`${device} has been mounted to ${mountPoint}`); + core.info('Successfully obtained sticky disk'); \ No newline at end of file diff --git a/fix-buildkit-corruption.patch b/fix-buildkit-corruption.patch new file mode 100644 index 0000000..7b25320 --- /dev/null +++ b/fix-buildkit-corruption.patch @@ -0,0 +1,125 @@ +diff --git a/src/main.ts b/src/main.ts +index abc123..def456 100644 +--- a/src/main.ts ++++ b/src/main.ts +@@ -613,18 +613,44 @@ function buildSummaryEnabled(): boolean { + + export async function shutdownBuildkitd(): Promise { + const startTime = Date.now(); +- const timeout = 10000; // 10 seconds ++ const timeout = 30000; // 30 seconds + const 
backoff = 300; // 300ms + + try { ++ // First, try graceful shutdown with SIGTERM + await execAsync(`sudo pkill -TERM buildkitd`); ++ core.info('Sent SIGTERM to buildkitd, waiting for graceful shutdown...'); + +- // Wait for buildkitd to shutdown with backoff retry +- while (Date.now() - startTime < timeout) { ++ // Wait for graceful shutdown (10 seconds max) ++ let gracefulShutdown = false; ++ const gracefulTimeout = 10000; ++ while (Date.now() - startTime < gracefulTimeout) { + try { + const {stdout} = await execAsync('pgrep buildkitd'); +- core.debug(`buildkitd process still running with PID: ${stdout.trim()}`); ++ if (stdout.trim()) { ++ core.debug(`buildkitd still running with PID: ${stdout.trim()}`); ++ } + await new Promise(resolve => setTimeout(resolve, backoff)); ++ } catch (error) { ++ if (error.code === 1) { ++ gracefulShutdown = true; ++ core.info('buildkitd shutdown gracefully'); ++ break; ++ } ++ throw error; ++ } ++ } ++ ++ // If still running after graceful period, force kill ++ if (!gracefulShutdown) { ++ core.warning('buildkitd did not shutdown gracefully, sending SIGKILL'); ++ await execAsync(`sudo pkill -KILL buildkitd`); ++ ++ // Wait for force kill to complete ++ while (Date.now() - startTime < timeout) { ++ try { ++ await execAsync('pgrep buildkitd'); ++ await new Promise(resolve => setTimeout(resolve, backoff)); + } catch (error) { + if (error.code === 1) { + // pgrep returns exit code 1 when no process is found, which means shutdown successful +@@ -636,7 +662,25 @@ export async function shutdownBuildkitd(): Promise { + throw error; + } + } ++ } ++ ++ // CRITICAL: Sync filesystem to ensure all buildkit writes are flushed ++ core.info('Syncing filesystem to flush buildkit writes...'); ++ await execAsync('sync'); ++ ++ // Wait a bit for sync to complete ++ await new Promise(resolve => setTimeout(resolve, 2000)); ++ ++ // Double-check no buildkit processes remain ++ try { ++ const {stdout} = await execAsync('pgrep -f buildkit'); ++ if 
(stdout.trim()) { ++ throw new Error(`Buildkit processes still running after shutdown: ${stdout.trim()}`); ++ } ++ } catch (error) { ++ if (error.code !== 1) throw error; ++ } + ++ core.info('buildkitd shutdown complete'); + throw new Error('Timed out waiting for buildkitd to shutdown after 10 seconds'); + } catch (error) { + core.error('error shutting down buildkitd process:', error); +@@ -392,6 +436,25 @@ actionsToolkit.run( + + await core.group('Cleaning up Blacksmith builder', async () => { + try { ++ // Capture debug info before cleanup ++ if (builderInfo.addr) { ++ try { ++ core.debug('Capturing buildkit state before cleanup...'); ++ await execAsync(`sudo buildctl --addr ${builderInfo.addr} debug workers > /tmp/buildkit-workers-final.log || true`); ++ await execAsync(`ps aux | grep buildkit > /tmp/buildkit-processes-final.log || true`); ++ await execAsync(`ls -la /var/lib/buildkit/ > /tmp/buildkit-files-final.log || true`); ++ ++ // Check database files ++ const dbFiles = ['history.db', 'cache.db', 'snapshots.db']; ++ for (const db of dbFiles) { ++ await execAsync(`sudo file /var/lib/buildkit/${db} >> /tmp/buildkit-files-final.log 2>&1 || true`); ++ } ++ } catch (debugError) { ++ core.debug(`Error capturing debug info: ${debugError.message}`); ++ } ++ } ++ + let exportRes; + if (!buildError) { + const buildxHistory = new BuildxHistory(); +@@ -431,8 +494,17 @@ actionsToolkit.run( + + await leaveTailnet(); + try { +- // Run sync to flush any pending writes before unmounting. 
++ // Multiple syncs to ensure all writes are flushed ++ core.debug('Running filesystem sync before unmount...'); ++ await execAsync('sync'); ++ await new Promise(resolve => setTimeout(resolve, 1000)); + await execAsync('sync'); ++ ++ // Force sync of specific mount point ++ try { ++ await execAsync(`sudo sync -f ${mountPoint}`); ++ } catch (e) { ++ core.debug(`Mount point sync failed: ${e.message}`); ++ } + const {stdout: mountOutput} = await execAsync(`mount | grep ${mountPoint}`); + if (mountOutput) { + for (let attempt = 1; attempt <= 3; attempt++) {
\ No newline at end of file
diff --git a/remove-tags.sh b/remove-tags.sh
new file mode 100755
index 0000000..3bd5132
--- /dev/null
+++ b/remove-tags.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+#
+# remove-tags.sh
+#
+# USAGE
+#   chmod +x remove-tags.sh
+#   ./remove-tags.sh
+#
+# NOTES
+#   • Assumes your fork’s remote is called “origin”.
+#   • Safe to re-run; it silently ignores tags that are already gone.
+#   • Groups remote deletions in batches of ≤50 so the push command line
+#     never gets too long.
+
+set -euo pipefail
+
+TAGS=$(cat <<'EOF'
+v6.15.0
+v6.14.0
+EOF
+)
+
+#######################################
+# 1. Delete the tags locally
+#######################################
+for tag in $TAGS; do
+  git tag -d "$tag" 2>/dev/null || true
+done
+
+#######################################
+# 2. Delete them on GitHub (origin)
+#    – push in batches of 50
+#######################################
+batch=()
+count=0
+for tag in $TAGS; do
+  batch+=(":refs/tags/$tag")
+  count=$((count + 1))  # not ((count++)): it returns status 1 when count is 0, which aborts under set -e
+  if (( count == 50 )); then
+    git push origin "${batch[@]}"
+    batch=()
+    count=0
+  fi
+done
+# push any remainder
+if (( ${#batch[@]} )); then
+  git push origin "${batch[@]}"
+fi
+
+echo "✅ All listed tags have been removed locally and on origin."
+
diff --git a/src/buildkit_validation.ts b/src/buildkit_validation.ts
new file mode 100644
index 0000000..09d7480
--- /dev/null
+++ b/src/buildkit_validation.ts
@@ -0,0 +1,118 @@
+import * as core from '@actions/core';
+import {exec} from 'child_process';
+import {promisify} from 'util';
+
+const execAsync = promisify(exec);
+const BUILDKIT_DAEMON_ADDR = 'tcp://127.0.0.1:1234';
+
+/**
+ * Validates on-disk buildkit state before committing the sticky disk.
+ *
+ * Checks, in order: that no buildkitd process is running, that no lock or
+ * temporary files are left under /var/lib/buildkit, and that none of the known
+ * database files present there is 0 bytes. Finishes with a `sync`.
+ *
+ * A database file that cannot be stat'ed (e.g. it does not exist) is skipped;
+ * only a file that exists and is empty fails validation.
+ *
+ * NOTE(review): main.ts appears to call this after its unmount step - confirm
+ * the database files are still visible at that point, otherwise the size
+ * check has nothing to inspect.
+ *
+ * @returns true if buildkit state appears healthy, false otherwise
+ */
+export async function validateBuildkitState(): Promise<boolean> {
+  try {
+    // First check if buildkitd process is running (it shouldn't be)
+    try {
+      await execAsync('pgrep buildkitd');
+      core.warning('buildkitd process is still running - state may be inconsistent');
+      return false;
+    } catch (error) {
+      // pgrep exits with code 1 when no process matched; any other failure is unexpected
+      if (error.code !== 1) {
+        core.warning(`Error checking for buildkitd process: ${error.message}`);
+        return false;
+      }
+    }
+
+    // Check for lock files that might indicate unclean shutdown
+    try {
+      const {stdout: lockFiles} = await execAsync('sudo find /var/lib/buildkit -name "*.lock" -o -name "*-wal" -o -name "*-shm" 2>/dev/null || true');
+      if (lockFiles.trim()) {
+        core.warning(`Found lock/temporary files indicating potential unclean shutdown: ${lockFiles.trim()}`);
+        return false;
+      }
+    } catch (error) {
+      core.debug(`Error checking for lock files: ${error.message}`);
+    }
+
+    // Check database file sizes - an existing 0 byte file indicates corruption
+    const dbFiles = ['history.db', 'cache.db', 'snapshots.db', 'metadata_v2.db', 'containerdmeta.db'];
+    for (const db of dbFiles) {
+      try {
+        // `|| true` leaves stdout empty when stat fails, so a missing file is
+        // not mistaken for an empty one (a `|| echo "0"` fallback would be).
+        const {stdout} = await execAsync(`sudo stat -c%s /var/lib/buildkit/${db} 2>/dev/null || true`);
+        const rawSize = stdout.trim();
+        if (rawSize === '') {
+          // File might not exist, which is okay
+          core.debug(`Could not check ${db}: stat returned no size`);
+          continue;
+        }
+        const size = parseInt(rawSize, 10);
+        if (size === 0) {
+          core.warning(`Database file ${db} is 0 bytes - indicates corruption`);
+          return false;
+        }
+      } catch (error) {
+        core.debug(`Could not check ${db}: ${error.message}`);
+      }
+    }
+
+    // Final sync to ensure everything is flushed
+    await execAsync('sync');
+
+    return true;
+  } catch (error) {
+    core.error(`Error validating buildkit state: ${error.message}`);
+    return false;
+  }
+}
+
+/**
+ * Attempts to start buildkitd temporarily to check database integrity.
+ * This is a more thorough check but takes longer.
+ *
+ * Runs buildkitd under `timeout 5s` on a separate address, scans its combined
+ * output for panic/corruption markers, and force-kills any buildkitd process
+ * left behind whether or not a marker was found.
+ *
+ * @returns false if a corruption marker was seen; true otherwise, including
+ *   when the check itself could not be run
+ */
+export async function validateBuildkitDatabases(): Promise<boolean> {
+  let corrupted = false;
+  try {
+    // Start buildkitd in a way that just validates databases
+    const {stdout: output} = await execAsync(
+      'timeout 5s sudo buildkitd --debug --addr tcp://127.0.0.1:12345 2>&1 || true'
+    );
+
+    // Check if it panicked with database errors
+    const corruptionMarkers = ['panic:', 'assertion failed', 'corrupted'];
+    corrupted = corruptionMarkers.some(marker => output.includes(marker));
+    if (corrupted) {
+      core.error('Buildkit database corruption detected during validation');
+    }
+
+    // Kill any remaining buildkitd process, also when corruption was detected,
+    // so the temporary daemon is never left running.
+    await execAsync('sudo pkill -9 buildkitd || true');
+  } catch (error) {
+    core.debug(`Database validation check failed: ${error.message}`);
+    // If the check fails, assume databases might be okay
+  }
+
+  return !corrupted;
+}
\ No newline at end of file
diff --git a/src/main.ts b/src/main.ts index 8382c2d..19f4eda 100644 --- a/src/main.ts +++ b/src/main.ts @@ -22,6 +22,7 @@ import {exec} from 'child_process'; import * as reporter from './reporter'; import {setupStickyDisk, startAndConfigureBuildkitd, getNumCPUs, leaveTailnet, pruneBuildkitCache} from './setup_builder'; import {Metric_MetricType} from '@buf/blacksmith_vm-agent.bufbuild_es/stickydisk/v1/stickydisk_pb'; +import {validateBuildkitState} from './buildkit_validation'; const DEFAULT_BUILDX_VERSION = 'v0.23.0'; @@ -413,10 +414,16 @@ actionsToolkit.run( } const buildkitdShutdownStartTime = Date.now(); - await 
shutdownBuildkitd(); - const buildkitdShutdownDurationMs = Date.now() - buildkitdShutdownStartTime; - await reporter.reportMetric(Metric_MetricType.BPA_BUILDKITD_SHUTDOWN_DURATION_MS, buildkitdShutdownDurationMs); - core.info('Shutdown buildkitd'); + try { + await shutdownBuildkitd(); + const buildkitdShutdownDurationMs = Date.now() - buildkitdShutdownStartTime; + await reporter.reportMetric(Metric_MetricType.BPA_BUILDKITD_SHUTDOWN_DURATION_MS, buildkitdShutdownDurationMs); + core.info('Shutdown buildkitd gracefully'); + } catch (shutdownError) { + // If buildkitd didn't shutdown gracefully, we should NOT commit the sticky disk + core.error(`Buildkitd shutdown failed: ${shutdownError.message}`); + throw new Error('Cannot commit sticky disk - buildkitd did not shutdown cleanly'); + } } else { core.debug('No buildkitd process found running'); } @@ -431,8 +438,11 @@ actionsToolkit.run( await leaveTailnet(); try { - // Run sync to flush any pending writes before unmounting. + // Multiple syncs to ensure all writes are flushed before unmounting await execAsync('sync'); + await new Promise(resolve => setTimeout(resolve, 200)); + await execAsync('sync'); + const {stdout: mountOutput} = await execAsync(`mount | grep ${mountPoint}`); if (mountOutput) { for (let attempt = 1; attempt <= 3; attempt++) { @@ -462,8 +472,22 @@ actionsToolkit.run( if (builderInfo.addr) { if (!buildError) { - await reporter.reportBuildCompleted(exportRes, builderInfo.buildId, ref, buildDurationSeconds, builderInfo.exposeId); + // Validate buildkit state before committing + const isStateValid = await validateBuildkitState(); + if (!isStateValid) { + core.error('Buildkit state validation failed - not committing sticky disk'); + throw new Error('Buildkit state validation failed - potential corruption detected'); + } + + try { + await reporter.reportBuildCompleted(exportRes, builderInfo.buildId, ref, buildDurationSeconds, builderInfo.exposeId); + } catch (commitError) { + core.error(`Failed to commit 
sticky disk: ${commitError.message}`); + throw commitError; + } } else { + // Don't commit the sticky disk if the build failed + core.warning('Build failed - not committing sticky disk to prevent corruption'); await reporter.reportBuildFailed(builderInfo.buildId, buildDurationSeconds, builderInfo.exposeId); } } @@ -524,8 +548,11 @@ actionsToolkit.run( } try { - // Run sync to flush any pending writes before unmounting. + // Multiple syncs to ensure all writes are flushed before unmounting await execAsync('sync'); + await new Promise(resolve => setTimeout(resolve, 200)); + await execAsync('sync'); + const {stdout: mountOutput} = await execAsync(`mount | grep ${mountPoint}`); if (mountOutput) { for (let attempt = 1; attempt <= 3; attempt++) { @@ -616,6 +643,7 @@ export async function shutdownBuildkitd(): Promise { const startTime = Date.now(); const timeout = 10000; // 10 seconds const backoff = 300; // 300ms + let gracefulShutdown = false; try { await execAsync(`sudo pkill -TERM buildkitd`); @@ -629,15 +657,27 @@ export async function shutdownBuildkitd(): Promise { } catch (error) { if (error.code === 1) { // pgrep returns exit code 1 when no process is found, which means shutdown successful + gracefulShutdown = true; core.debug('buildkitd successfully shutdown'); - return; + break; } // Some other error occurred throw error; } } - throw new Error('Timed out waiting for buildkitd to shutdown after 10 seconds'); + if (!gracefulShutdown) { + // CRITICAL: Do not continue if buildkitd didn't shutdown cleanly + // This prevents committing a potentially corrupted device + throw new Error('buildkitd failed to shutdown gracefully within timeout - device may be corrupted'); + } + + // CRITICAL: Sync after buildkitd exits to flush all database writes + core.debug('Syncing filesystem after buildkitd shutdown...'); + await execAsync('sync'); + // Give kernel time to complete the sync + await new Promise(resolve => setTimeout(resolve, 500)); + } catch (error) { core.error('error 
shutting down buildkitd process:', error); throw error; diff --git a/src/reporter.ts b/src/reporter.ts index 6815b9b..7fe4275 100644 --- a/src/reporter.ts +++ b/src/reporter.ts @@ -7,6 +7,8 @@ import {createClient} from '@connectrpc/connect'; import {createGrpcTransport} from '@connectrpc/connect-node'; import {StickyDiskService} from '@buf/blacksmith_vm-agent.connectrpc_es/stickydisk/v1/stickydisk_connect'; import {Metric, Metric_MetricType} from '@buf/blacksmith_vm-agent.bufbuild_es/stickydisk/v1/stickydisk_pb'; +import {exec} from 'child_process'; +import {promisify} from 'util'; // Configure base axios instance for Blacksmith API. const createBlacksmithAPIClient = () => { @@ -66,6 +68,14 @@ export async function reportBuildCompleted(exportRes?: ExportRecordResponse, bla return; } + // Add a final sync before committing to ensure all writes are persisted + try { + await promisify(exec)('sync'); + core.debug('Synced filesystem before committing sticky disk'); + } catch (e) { + core.debug('Failed to sync before commit: ' + e); + } + try { const agentClient = createBlacksmithAgentClient();