From 69e9bd2181217ada199663f3e8873dc1187713ab Mon Sep 17 00:00:00 2001 From: Raj Date: Wed, 17 Sep 2025 15:20:09 -0700 Subject: [PATCH 01/13] Add filesystem notification (FSNotify) support --- .../SandboxContext/SandboxContext.grpc.swift | 100 ++++++++++ .../SandboxContext/SandboxContext.pb.swift | 178 ++++++++++++++++++ .../SandboxContext/SandboxContext.proto | 21 +++ Sources/Containerization/Vminitd.swift | 11 ++ Sources/Integration/Suite.swift | 3 + Sources/cctl/FSNotifyCommand.swift | 121 ++++++++++++ Sources/cctl/cctl.swift | 1 + vminitd/Sources/vminitd/Server+GRPC.swift | 66 +++++++ 8 files changed, 501 insertions(+) create mode 100644 Sources/cctl/FSNotifyCommand.swift diff --git a/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift b/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift index ed59bfe2..96ae55d6 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift +++ b/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift @@ -168,6 +168,11 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtoc _ request: Com_Apple_Containerization_Sandbox_V3_KillRequest, callOptions: CallOptions? ) -> UnaryCall + + func notifyFileSystemEvent( + _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + callOptions: CallOptions? + ) -> UnaryCall } extension Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtocol { @@ -661,6 +666,24 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtocol { interceptors: self.interceptors?.makeKillInterceptors() ?? [] ) } + + /// Notify guest of filesystem events from host. + /// + /// - Parameters: + /// - request: Request to send to NotifyFileSystemEvent. + /// - callOptions: Call options. + /// - Returns: A `UnaryCall` with futures for the metadata, status and response. 
+ public func notifyFileSystemEvent( + _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + callOptions: CallOptions? = nil + ) -> UnaryCall { + return self.makeUnaryCall( + path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] + ) + } } @available(*, deprecated) @@ -860,6 +883,11 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientP _ request: Com_Apple_Containerization_Sandbox_V3_KillRequest, callOptions: CallOptions? ) -> GRPCAsyncUnaryCall + + func makeNotifyFileSystemEventCall( + _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + callOptions: CallOptions? + ) -> GRPCAsyncUnaryCall } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -1195,6 +1223,18 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientProtoco interceptors: self.interceptors?.makeKillInterceptors() ?? [] ) } + + public func makeNotifyFileSystemEventCall( + _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + callOptions: CallOptions? = nil + ) -> GRPCAsyncUnaryCall { + return self.makeAsyncUnaryCall( + path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] + ) + } } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -1522,6 +1562,18 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientProtoco interceptors: self.interceptors?.makeKillInterceptors() ?? [] ) } + + public func notifyFileSystemEvent( + _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + callOptions: CallOptions? 
= nil + ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { + return try await self.performAsyncUnaryCall( + path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, + request: request, + callOptions: callOptions ?? self.defaultCallOptions, + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] + ) + } } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -1623,6 +1675,9 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextClientInterc /// - Returns: Interceptors to use when invoking 'kill'. func makeKillInterceptors() -> [ClientInterceptor] + + /// - Returns: Interceptors to use when invoking 'notifyFileSystemEvent'. + func makeNotifyFileSystemEventInterceptors() -> [ClientInterceptor] } public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata { @@ -1657,6 +1712,7 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata { Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.configureHosts, Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.sync, Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.kill, + Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent, ] ) @@ -1822,6 +1878,12 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata { path: "/com.apple.containerization.sandbox.v3.SandboxContext/Kill", type: GRPCCallType.unary ) + + public static let notifyFileSystemEvent = GRPCMethodDescriptor( + name: "NotifyFileSystemEvent", + path: "/com.apple.containerization.sandbox.v3.SandboxContext/NotifyFileSystemEvent", + type: GRPCCallType.unary + ) } } @@ -1912,6 +1974,9 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider: Ca /// Send a signal to a process via the PID. 
func kill(request: Com_Apple_Containerization_Sandbox_V3_KillRequest, context: StatusOnlyCallContext) -> EventLoopFuture + + /// Notify guest of filesystem events from host. + func notifyFileSystemEvent(request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, context: StatusOnlyCallContext) -> EventLoopFuture } extension Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider { @@ -2169,6 +2234,15 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider { userFunction: self.kill(request:context:) ) + case "NotifyFileSystemEvent": + return UnaryServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [], + userFunction: self.notifyFileSystemEvent(request:context:) + ) + default: return nil } @@ -2345,6 +2419,12 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvide request: Com_Apple_Containerization_Sandbox_V3_KillRequest, context: GRPCAsyncServerCallContext ) async throws -> Com_Apple_Containerization_Sandbox_V3_KillResponse + + /// Notify guest of filesystem events from host. + func notifyFileSystemEvent( + request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + context: GRPCAsyncServerCallContext + ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -2609,6 +2689,15 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvider { wrapping: { try await self.kill(request: $0, context: $1) } ) + case "NotifyFileSystemEvent": + return GRPCAsyncServerHandler( + context: context, + requestDeserializer: ProtobufDeserializer(), + responseSerializer: ProtobufSerializer(), + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? 
[], + wrapping: { try await self.notifyFileSystemEvent(request: $0, context: $1) } + ) + default: return nil } @@ -2724,6 +2813,10 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextServerInterc /// - Returns: Interceptors to use when handling 'kill'. /// Defaults to calling `self.makeInterceptors()`. func makeKillInterceptors() -> [ServerInterceptor] + + /// - Returns: Interceptors to use when handling 'notifyFileSystemEvent'. + /// Defaults to calling `self.makeInterceptors()`. + func makeNotifyFileSystemEventInterceptors() -> [ServerInterceptor] } public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata { @@ -2758,6 +2851,7 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata { Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata.Methods.configureHosts, Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata.Methods.sync, Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata.Methods.kill, + Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata.Methods.notifyFileSystemEvent, ] ) @@ -2923,5 +3017,11 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata { path: "/com.apple.containerization.sandbox.v3.SandboxContext/Kill", type: GRPCCallType.unary ) + + public static let notifyFileSystemEvent = GRPCMethodDescriptor( + name: "NotifyFileSystemEvent", + path: "/com.apple.containerization.sandbox.v3.SandboxContext/NotifyFileSystemEvent", + type: GRPCCallType.unary + ) } } diff --git a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift index f4b63152..1b63d400 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift +++ b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift @@ -37,6 +37,56 @@ fileprivate struct _GeneratedWithProtocGenSwiftVersion: SwiftProtobuf.ProtobufAP typealias Version = _2 } +public enum 
Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProtobuf.Enum, Swift.CaseIterable { + public typealias RawValue = Int + case create // = 0 + case delete // = 1 + case link // = 2 + case unlink // = 3 + case modify // = 4 + case undefined // = 99 + case UNRECOGNIZED(Int) + + public init() { + self = .create + } + + public init?(rawValue: Int) { + switch rawValue { + case 0: self = .create + case 1: self = .delete + case 2: self = .link + case 3: self = .unlink + case 4: self = .modify + case 99: self = .undefined + default: self = .UNRECOGNIZED(rawValue) + } + } + + public var rawValue: Int { + switch self { + case .create: return 0 + case .delete: return 1 + case .link: return 2 + case .unlink: return 3 + case .modify: return 4 + case .undefined: return 99 + case .UNRECOGNIZED(let i): return i + } + } + + // The compiler won't synthesize support with the UNRECOGNIZED case. + public static let allCases: [Com_Apple_Containerization_Sandbox_V3_FileSystemEventType] = [ + .create, + .delete, + .link, + .unlink, + .modify, + .undefined, + ] + +} + public struct Com_Apple_Containerization_Sandbox_V3_Stdio: Sendable { // SwiftProtobuf.Message conformance is added in an extension below. See the // `Message` and `Message+*Additions` files in the SwiftProtobuf library for @@ -1206,10 +1256,58 @@ public struct Com_Apple_Containerization_Sandbox_V3_NetworkStats: Sendable { public init() {} } +public struct Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. 
+ + public var path: String = String() + + public var eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType = .create + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} +} + +public struct Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse: Sendable { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + public var success: Bool = false + + public var error: String { + get {return _error ?? String()} + set {_error = newValue} + } + /// Returns true if `error` has been explicitly set. + public var hasError: Bool {return self._error != nil} + /// Clears the value of `error`. Subsequent reads from it will return its default value. + public mutating func clearError() {self._error = nil} + + public var unknownFields = SwiftProtobuf.UnknownStorage() + + public init() {} + + fileprivate var _error: String? = nil +} + // MARK: - Code below here is support for the SwiftProtobuf runtime. 
fileprivate let _protobuf_package = "com.apple.containerization.sandbox.v3" +extension Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProtobuf._ProtoNameProviding { + public static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "CREATE"), + 1: .same(proto: "DELETE"), + 2: .same(proto: "LINK"), + 3: .same(proto: "UNLINK"), + 4: .same(proto: "MODIFY"), + 99: .same(proto: "UNDEFINED"), + ] +} + extension Com_Apple_Containerization_Sandbox_V3_Stdio: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { public static let protoMessageName: String = _protobuf_package + ".Stdio" public static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ @@ -3612,3 +3710,83 @@ extension Com_Apple_Containerization_Sandbox_V3_NetworkStats: SwiftProtobuf.Mess return true } } + +extension Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".NotifyFileSystemEventRequest" + public static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "path"), + 2: .standard(proto: "event_type"), + ] + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularStringField(value: &self.path) }() + case 2: try { try decoder.decodeSingularEnumField(value: &self.eventType) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + if !self.path.isEmpty { + try visitor.visitSingularStringField(value: self.path, fieldNumber: 1) + } + if self.eventType != .create { + try visitor.visitSingularEnumField(value: self.eventType, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, rhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest) -> Bool { + if lhs.path != rhs.path {return false} + if lhs.eventType != rhs.eventType {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} + +extension Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + public static let protoMessageName: String = _protobuf_package + ".NotifyFileSystemEventResponse" + public static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "success"), + 2: .same(proto: "error"), + ] + + public mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every case branch when no optimizations are + // enabled. 
https://github.com/apple/swift-protobuf/issues/1034 + switch fieldNumber { + case 1: try { try decoder.decodeSingularBoolField(value: &self.success) }() + case 2: try { try decoder.decodeSingularStringField(value: &self._error) }() + default: break + } + } + } + + public func traverse(visitor: inout V) throws { + // The use of inline closures is to circumvent an issue where the compiler + // allocates stack space for every if/case branch local when no optimizations + // are enabled. https://github.com/apple/swift-protobuf/issues/1034 and + // https://github.com/apple/swift-protobuf/issues/1182 + if self.success != false { + try visitor.visitSingularBoolField(value: self.success, fieldNumber: 1) + } + try { if let v = self._error { + try visitor.visitSingularStringField(value: v, fieldNumber: 2) + } }() + try unknownFields.traverse(visitor: &visitor) + } + + public static func ==(lhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse, rhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse) -> Bool { + if lhs.success != rhs.success {return false} + if lhs._error != rhs._error {return false} + if lhs.unknownFields != rhs.unknownFields {return false} + return true + } +} diff --git a/Sources/Containerization/SandboxContext/SandboxContext.proto b/Sources/Containerization/SandboxContext/SandboxContext.proto index ea3b90ed..398e5192 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.proto +++ b/Sources/Containerization/SandboxContext/SandboxContext.proto @@ -66,6 +66,8 @@ service SandboxContext { rpc Sync(SyncRequest) returns (SyncResponse); // Send a signal to a process via the PID. rpc Kill(KillRequest) returns (KillResponse); + // Notify guest of filesystem events from host. 
+ rpc NotifyFileSystemEvent(NotifyFileSystemEventRequest) returns (NotifyFileSystemEventResponse); } message Stdio { @@ -352,3 +354,22 @@ message NetworkStats { uint64 receivedErrors = 6; uint64 transmittedErrors = 7; } + +enum FileSystemEventType { + CREATE = 0; + DELETE = 1; + LINK = 2; + UNLINK = 3; + MODIFY = 4; + UNDEFINED = 99; +} + +message NotifyFileSystemEventRequest { + string path = 1; + FileSystemEventType event_type = 2; +} + +message NotifyFileSystemEventResponse { + bool success = 1; + optional string error = 2; +} diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index 225f3b2b..2085ff25 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -427,6 +427,17 @@ extension Vminitd { try await Task.sleep(for: .milliseconds(10)) try await self.sync() } + + /// Send a filesystem event notification to the guest. + public func notifyFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws + -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse + { + let request = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest.with { + $0.path = path + $0.eventType = eventType + } + return try await client.notifyFileSystemEvent(request) + } } extension Hosts { diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 32b8ef24..29bfd9ae 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -309,6 +309,9 @@ struct IntegrationSuite: AsyncParsableCommand { Test("pod container filesystem isolation", testPodContainerFilesystemIsolation), Test("pod container PID namespace isolation", testPodContainerPIDNamespaceIsolation), Test("pod container independent resource limits", testPodContainerIndependentResourceLimits), + + // fsnotify + Test("fsnotify events", testFSNotifyEvents), ] let passed: Atomic = Atomic(0) diff --git 
a/Sources/cctl/FSNotifyCommand.swift b/Sources/cctl/FSNotifyCommand.swift new file mode 100644 index 00000000..158a4774 --- /dev/null +++ b/Sources/cctl/FSNotifyCommand.swift @@ -0,0 +1,121 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2025 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ArgumentParser +import Containerization +import ContainerizationError +import Foundation +import GRPC +import NIOCore +import NIOPosix + +extension Application { + struct FSNotify: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "fsnotify", + abstract: "Send filesystem notification events to a running container" + ) + + @Option(name: [.customLong("container"), .customShort("c")], help: "Container ID to send notification to") + var containerID: String + + @Option(name: [.customLong("path"), .customShort("p")], help: "Path in the container to notify about") + var path: String + + @Option(name: [.customLong("event"), .customShort("e")], help: "Event type (create, delete, modify, link, unlink)") + var eventType: String = "modify" + + @Option(name: .customLong("vsock-socket"), help: "Path to the container's VSock socket") + var vsockSocket: String? 
+ + @Option(name: .customLong("vsock-port"), help: "VSock port to connect to (default: 1024)") + var vsockPort: UInt32 = 1024 + + func run() async throws { + let eventType = try parseEventType(eventType) + + print("Sending FSNotify event to container '\(containerID)':") + print(" Path: \(path)") + print(" Event: \(eventType)") + + guard let socket = vsockSocket else { + print("Error: --vsock-socket parameter required") + print("Usage: cctl fsnotify --container --path --vsock-socket ") + print("") + print("Note: For end-to-end testing with real containers, use:") + print(" cctl test --include 'fsnotify events'") + throw ExitCode.failure + } + try await sendFSNotificationViaSocket( + socket: socket, + path: path, + eventType: eventType + ) + + print("FSNotify event sent successfully") + } + + private func parseEventType(_ eventString: String) throws -> Com_Apple_Containerization_Sandbox_V3_FileSystemEventType { + switch eventString.lowercased() { + case "create": + return .create + case "delete": + return .delete + case "modify": + return .modify + case "link": + return .link + case "unlink": + return .unlink + default: + throw "Invalid event type '\(eventString)'. Valid options: create, delete, modify, link, unlink" + } + } + + private func sendFSNotificationViaSocket( + socket: String, + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + ) async throws { + let group = MultiThreadedEventLoopGroup(numberOfThreads: 1) + + do { + // Connect to the container's VSock socket + let client = Vminitd.Client(socket: socket, group: group) + let vminitd = Vminitd(client: client) + + // Send the notification using the public API + let response = try await vminitd.notifyFileSystemEvent(path: path, eventType: eventType) + + if !response.success { + let errorMsg = response.hasError ? 
response.error : "Unknown error" + throw "FSNotify failed: \(errorMsg)" + } + + // Close the connection + try await vminitd.close() + + } catch { + // Ensure group is shutdown even if there's an error + try await group.shutdownGracefully() + throw error + } + + // Shutdown the event loop group + try await group.shutdownGracefully() + } + } +} diff --git a/Sources/cctl/cctl.swift b/Sources/cctl/cctl.swift index a7e879c5..12ec0727 100644 --- a/Sources/cctl/cctl.swift +++ b/Sources/cctl/cctl.swift @@ -66,6 +66,7 @@ struct Application: AsyncParsableCommand { Login.self, Rootfs.self, Run.self, + FSNotify.self, ] ) } diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index 9b9d9228..a00f7368 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -1081,6 +1081,72 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid $0.result = r } } + + func notifyFileSystemEvent( + request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + context: GRPCAsyncServerCallContext + ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { + log.debug( + "notifyFileSystemEvent", + metadata: [ + "path": "\(request.path)", + "eventType": "\(request.eventType)", + ]) + + do { + try await generateSyntheticInotifyEvent( + path: request.path, + eventType: request.eventType + ) + + return .with { + $0.success = true + } + } catch { + log.error( + "notifyFileSystemEvent", + metadata: [ + "error": "\(error)" + ]) + + return .with { + $0.success = false + $0.error = error.localizedDescription + } + } + } + + private func generateSyntheticInotifyEvent( + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + ) async throws { + switch eventType { + case .modify: + // Touch file to update timestamp -> generates IN_ATTRIB event + let now = Date() + try FileManager.default.setAttributes( + 
[.modificationDate: now], + ofItemAtPath: path + ) + + case .create: + // Use chmod with same permissions to generate IN_ATTRIB event + let attributes = try FileManager.default.attributesOfItem(atPath: path) + let permissions = attributes[.posixPermissions] as? NSNumber ?? NSNumber(value: 0o644) + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) + + case .delete: + // We can't generate delete events for files that don't exist + // This would need to be handled by the application layer + log.warning("Delete events cannot be synthesized for existing files") + + default: + log.warning("Unsupported filesystem event type: \(eventType)") + } + } } extension Com_Apple_Containerization_Sandbox_V3_ConfigureHostsRequest { From 7d8349987d620ab5d1e2d74342296dcb8261f676 Mon Sep 17 00:00:00 2001 From: Raj Date: Mon, 22 Sep 2025 14:48:29 -0700 Subject: [PATCH 02/13] fix comments --- .../SandboxContext/SandboxContext.grpc.swift | 73 +++++++++------- .../SandboxContext/SandboxContext.proto | 2 +- Sources/Containerization/Vminitd.swift | 14 ++- vminitd/Sources/vminitd/Server+GRPC.swift | 86 ++++++++----------- 4 files changed, 94 insertions(+), 81 deletions(-) diff --git a/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift b/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift index 96ae55d6..67486d94 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift +++ b/Sources/Containerization/SandboxContext/SandboxContext.grpc.swift @@ -170,9 +170,9 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtoc ) -> UnaryCall func notifyFileSystemEvent( - _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, - callOptions: CallOptions? 
- ) -> UnaryCall + callOptions: CallOptions?, + handler: @escaping (Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse) -> Void + ) -> BidirectionalStreamingCall } extension Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtocol { @@ -669,19 +669,22 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextClientProtocol { /// Notify guest of filesystem events from host. /// + /// Callers should use the `send` method on the returned object to send messages + /// to the server. The caller should send an `.end` after the final message has been sent. + /// /// - Parameters: - /// - request: Request to send to NotifyFileSystemEvent. /// - callOptions: Call options. - /// - Returns: A `UnaryCall` with futures for the metadata, status and response. + /// - handler: A closure called when each response is received from the server. + /// - Returns: A `ClientStreamingCall` with futures for the metadata and status. public func notifyFileSystemEvent( - _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, - callOptions: CallOptions? = nil - ) -> UnaryCall { - return self.makeUnaryCall( + callOptions: CallOptions? = nil, + handler: @escaping (Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse) -> Void + ) -> BidirectionalStreamingCall { + return self.makeBidirectionalStreamingCall( path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, - request: request, callOptions: callOptions ?? self.defaultCallOptions, - interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? 
[], + handler: handler ) } } @@ -885,9 +888,8 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientP ) -> GRPCAsyncUnaryCall func makeNotifyFileSystemEventCall( - _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, callOptions: CallOptions? - ) -> GRPCAsyncUnaryCall + ) -> GRPCAsyncBidirectionalStreamingCall } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -1225,12 +1227,10 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientProtoco } public func makeNotifyFileSystemEventCall( - _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, callOptions: CallOptions? = nil - ) -> GRPCAsyncUnaryCall { - return self.makeAsyncUnaryCall( + ) -> GRPCAsyncBidirectionalStreamingCall { + return self.makeAsyncBidirectionalStreamingCall( path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, - request: request, callOptions: callOptions ?? self.defaultCallOptions, interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] ) @@ -1563,13 +1563,25 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncClientProtoco ) } - public func notifyFileSystemEvent( - _ request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + public func notifyFileSystemEvent( + _ requests: RequestStream, callOptions: CallOptions? = nil - ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { - return try await self.performAsyncUnaryCall( + ) -> GRPCAsyncResponseStream where RequestStream: Sequence, RequestStream.Element == Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest { + return self.performAsyncBidirectionalStreamingCall( path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, - request: request, + requests: requests, + callOptions: callOptions ?? 
self.defaultCallOptions, + interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] + ) + } + + public func notifyFileSystemEvent( + _ requests: RequestStream, + callOptions: CallOptions? = nil + ) -> GRPCAsyncResponseStream where RequestStream: AsyncSequence & Sendable, RequestStream.Element == Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest { + return self.performAsyncBidirectionalStreamingCall( + path: Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata.Methods.notifyFileSystemEvent.path, + requests: requests, callOptions: callOptions ?? self.defaultCallOptions, interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [] ) @@ -1882,7 +1894,7 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextClientMetadata { public static let notifyFileSystemEvent = GRPCMethodDescriptor( name: "NotifyFileSystemEvent", path: "/com.apple.containerization.sandbox.v3.SandboxContext/NotifyFileSystemEvent", - type: GRPCCallType.unary + type: GRPCCallType.bidirectionalStreaming ) } } @@ -1976,7 +1988,7 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider: Ca func kill(request: Com_Apple_Containerization_Sandbox_V3_KillRequest, context: StatusOnlyCallContext) -> EventLoopFuture /// Notify guest of filesystem events from host. 
- func notifyFileSystemEvent(request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, context: StatusOnlyCallContext) -> EventLoopFuture + func notifyFileSystemEvent(context: StreamingResponseCallContext) -> EventLoopFuture<(StreamEvent) -> Void> } extension Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider { @@ -2235,12 +2247,12 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextProvider { ) case "NotifyFileSystemEvent": - return UnaryServerHandler( + return BidirectionalStreamingServerHandler( context: context, requestDeserializer: ProtobufDeserializer(), responseSerializer: ProtobufSerializer(), interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? [], - userFunction: self.notifyFileSystemEvent(request:context:) + observerFactory: self.notifyFileSystemEvent(context:) ) default: @@ -2422,9 +2434,10 @@ public protocol Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvide /// Notify guest of filesystem events from host. func notifyFileSystemEvent( - request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, context: GRPCAsyncServerCallContext - ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse + ) async throws } @available(macOS 10.15, iOS 13, tvOS 13, watchOS 6, *) @@ -2695,7 +2708,7 @@ extension Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvider { requestDeserializer: ProtobufDeserializer(), responseSerializer: ProtobufSerializer(), interceptors: self.interceptors?.makeNotifyFileSystemEventInterceptors() ?? 
[], - wrapping: { try await self.notifyFileSystemEvent(request: $0, context: $1) } + wrapping: { try await self.notifyFileSystemEvent(requestStream: $0, responseStream: $1, context: $2) } ) default: @@ -3021,7 +3034,7 @@ public enum Com_Apple_Containerization_Sandbox_V3_SandboxContextServerMetadata { public static let notifyFileSystemEvent = GRPCMethodDescriptor( name: "NotifyFileSystemEvent", path: "/com.apple.containerization.sandbox.v3.SandboxContext/NotifyFileSystemEvent", - type: GRPCCallType.unary + type: GRPCCallType.bidirectionalStreaming ) } } diff --git a/Sources/Containerization/SandboxContext/SandboxContext.proto b/Sources/Containerization/SandboxContext/SandboxContext.proto index 398e5192..e4eb9a63 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.proto +++ b/Sources/Containerization/SandboxContext/SandboxContext.proto @@ -67,7 +67,7 @@ service SandboxContext { // Send a signal to a process via the PID. rpc Kill(KillRequest) returns (KillResponse); // Notify guest of filesystem events from host. 
- rpc NotifyFileSystemEvent(NotifyFileSystemEventRequest) returns (NotifyFileSystemEventResponse); + rpc NotifyFileSystemEvent(stream NotifyFileSystemEventRequest) returns (stream NotifyFileSystemEventResponse); } message Stdio { diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index 2085ff25..fc4ca794 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -436,7 +436,19 @@ extension Vminitd { $0.path = path $0.eventType = eventType } - return try await client.notifyFileSystemEvent(request) + + let requests = AsyncStream { continuation in + continuation.yield(request) + continuation.finish() + } + + let responses = client.notifyFileSystemEvent(requests) + + for try await response in responses { + return response + } + + throw ContainerizationError(.internalError, message: "No response received from notifyFileSystemEvent") } } diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index a00f7368..77409155 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -1083,35 +1083,41 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid } func notifyFileSystemEvent( - request: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, + requestStream: GRPCAsyncRequestStream, + responseStream: GRPCAsyncResponseStreamWriter, context: GRPCAsyncServerCallContext - ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { - log.debug( - "notifyFileSystemEvent", - metadata: [ - "path": "\(request.path)", - "eventType": "\(request.eventType)", - ]) - - do { - try await generateSyntheticInotifyEvent( - path: request.path, - eventType: request.eventType - ) - - return .with { - $0.success = true - } - } catch { - log.error( + ) async throws { + for try await request in requestStream { + log.debug( "notifyFileSystemEvent", metadata: [ 
- "error": "\(error)" + "path": "\(request.path)", + "eventType": "\(request.eventType)", ]) - return .with { - $0.success = false - $0.error = error.localizedDescription + do { + try await generateSyntheticInotifyEvent( + path: request.path, + eventType: request.eventType + ) + + let response = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse.with { + $0.success = true + } + try await responseStream.send(response) + + } catch { + log.error( + "notifyFileSystemEvent", + metadata: [ + "error": "\(error)" + ]) + + let response = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse.with { + $0.success = false + $0.error = error.localizedDescription + } + try await responseStream.send(response) } } } @@ -1120,32 +1126,14 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType ) async throws { - switch eventType { - case .modify: - // Touch file to update timestamp -> generates IN_ATTRIB event - let now = Date() - try FileManager.default.setAttributes( - [.modificationDate: now], - ofItemAtPath: path - ) - - case .create: - // Use chmod with same permissions to generate IN_ATTRIB event - let attributes = try FileManager.default.attributesOfItem(atPath: path) - let permissions = attributes[.posixPermissions] as? NSNumber ?? NSNumber(value: 0o644) - try FileManager.default.setAttributes( - [.posixPermissions: permissions], - ofItemAtPath: path - ) - - case .delete: - // We can't generate delete events for files that don't exist - // This would need to be handled by the application layer - log.warning("Delete events cannot be synthesized for existing files") - - default: - log.warning("Unsupported filesystem event type: \(eventType)") + let attributes = try FileManager.default.attributesOfItem(atPath: path) + guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { + throw GRPCStatus(code: .internalError, message: "Failed to get file permissions for path: \(path)") } + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) } } From dd56895664a7b9c5f87dcb006dcf433a288696a1 Mon Sep 17 00:00:00 2001 From: Raj Date: Wed, 24 Sep 2025 15:34:52 -0700 Subject: [PATCH 03/13] fix integration test issues --- vminitd/Sources/vminitd/Server+GRPC.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index 77409155..daf843b4 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -1085,7 +1085,7 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid func notifyFileSystemEvent( requestStream: GRPCAsyncRequestStream, responseStream: GRPCAsyncResponseStreamWriter, - context: GRPCAsyncServerCallContext + context: GRPC.GRPCAsyncServerCallContext ) async throws { for try await request in requestStream { log.debug( @@ -1126,6 +1126,10 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType ) async throws { + if eventType == .delete && !FileManager.default.fileExists(atPath: path) { + return + } + let attributes = try FileManager.default.attributesOfItem(atPath: path) guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { throw GRPCStatus(code: .internalError, message: "Failed to get file permissions for path: \(path)") From b7f5bcaeb2cf2b6a17834f5e0f581eb168c3b7dc Mon Sep 17 00:00:00 2001 From: Raj Date: Thu, 25 Sep 2025 14:35:45 -0700 Subject: [PATCH 04/13] implement thread-per-container namespace isolation for FSNotify (test failing) --- .../SandboxContext/SandboxContext.pb.swift | 8 + .../SandboxContext/SandboxContext.proto | 1 + Sources/Containerization/Vminitd.swift | 9 +- Sources/cctl/FSNotifyCommand.swift | 6 +- .../Sources/vminitd/ManagedContainer.swift | 169 +++++++++++++++++- 5 files changed, 188 insertions(+), 5 deletions(-) diff --git a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift index 1b63d400..a4994a38 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift +++ b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift @@ -1265,6 +1265,8 @@ public struct Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest public var eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType = .create + public var containerID: String = String() + public var unknownFields = SwiftProtobuf.UnknownStorage() public init() {} @@ -3716,6 +3718,7 @@ extension Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest: Sw public static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ 1: .same(proto: "path"), 2: .standard(proto: "event_type"), + 3: .standard(proto: "container_id"), ] public mutating func decodeMessage(decoder: inout D) throws { @@ -3726,6 +3729,7 @@ extension Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest: Sw switch fieldNumber { case 1: try { try decoder.decodeSingularStringField(value: &self.path) }() case 2: try { try decoder.decodeSingularEnumField(value: &self.eventType) }() + case 3: try { try decoder.decodeSingularStringField(value: &self.containerID) }() default: break } } @@ -3738,12 
+3742,16 @@ extension Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest: Sw if self.eventType != .create { try visitor.visitSingularEnumField(value: self.eventType, fieldNumber: 2) } + if !self.containerID.isEmpty { + try visitor.visitSingularStringField(value: self.containerID, fieldNumber: 3) + } try unknownFields.traverse(visitor: &visitor) } public static func ==(lhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest, rhs: Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest) -> Bool { if lhs.path != rhs.path {return false} if lhs.eventType != rhs.eventType {return false} + if lhs.containerID != rhs.containerID {return false} if lhs.unknownFields != rhs.unknownFields {return false} return true } diff --git a/Sources/Containerization/SandboxContext/SandboxContext.proto b/Sources/Containerization/SandboxContext/SandboxContext.proto index e4eb9a63..722772e9 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.proto +++ b/Sources/Containerization/SandboxContext/SandboxContext.proto @@ -367,6 +367,7 @@ enum FileSystemEventType { message NotifyFileSystemEventRequest { string path = 1; FileSystemEventType event_type = 2; + string container_id = 3; } message NotifyFileSystemEventResponse { diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index fc4ca794..84a64962 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -429,12 +429,15 @@ extension Vminitd { } /// Send a filesystem event notification to the guest. 
- public func notifyFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws - -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse - { + public func notifyFileSystemEvent( + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType, + containerID: String + ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { let request = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest.with { $0.path = path $0.eventType = eventType + $0.containerID = containerID } let requests = AsyncStream { continuation in diff --git a/Sources/cctl/FSNotifyCommand.swift b/Sources/cctl/FSNotifyCommand.swift index 158a4774..82b82605 100644 --- a/Sources/cctl/FSNotifyCommand.swift +++ b/Sources/cctl/FSNotifyCommand.swift @@ -98,7 +98,11 @@ extension Application { let vminitd = Vminitd(client: client) // Send the notification using the public API - let response = try await vminitd.notifyFileSystemEvent(path: path, eventType: eventType) + let response = try await vminitd.notifyFileSystemEvent( + path: path, + eventType: eventType, + containerID: containerID + ) if !response.success { let errorMsg = response.hasError ? 
response.error : "Unknown error" diff --git a/vminitd/Sources/vminitd/ManagedContainer.swift b/vminitd/Sources/vminitd/ManagedContainer.swift index 013e9a93..bdd6a324 100644 --- a/vminitd/Sources/vminitd/ManagedContainer.swift +++ b/vminitd/Sources/vminitd/ManagedContainer.swift @@ -15,11 +15,19 @@ //===----------------------------------------------------------------------===// import Cgroup +import Containerization import ContainerizationError import ContainerizationOCI import ContainerizationOS import Foundation import Logging +import Synchronization + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif actor ManagedContainer { let id: String @@ -29,6 +37,131 @@ actor ManagedContainer { private let log: Logger private let bundle: ContainerizationOCI.Bundle private var execs: [String: ManagedProcess] = [:] + private var namespaceWorker: NamespaceWorker? + + /// Worker thread that runs in container's namespace for filesystem operations + private final class NamespaceWorker: @unchecked Sendable { + private let containerID: String + private let containerPID: Int32 + private var workerThread: Thread? 
+ private let eventQueue: Mutex<[FileSystemEvent]> = Mutex([]) + private let semaphore: DispatchSemaphore = DispatchSemaphore(value: 0) + private let shouldStop: Atomic = Atomic(false) + + struct FileSystemEvent: Sendable { + let path: String + let eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + let completion: @Sendable (Result) -> Void + } + + init(containerID: String, containerPID: Int32) { + self.containerID = containerID + self.containerPID = containerPID + } + + func start() throws { + guard workerThread == nil else { + throw ContainerizationError(.invalidState, message: "NamespaceWorker already started") + } + + let thread = Thread { [weak self] in + self?.runWorkerLoop() + } + thread.name = "namespace-worker-\(containerID)" + workerThread = thread + thread.start() + } + + func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { + try await withCheckedThrowingContinuation { continuation in + let event = FileSystemEvent( + path: path, + eventType: eventType, + completion: { @Sendable result in + continuation.resume(with: result) + } + ) + + eventQueue.withLock { (queue: inout [FileSystemEvent]) in + queue.append(event) + } + semaphore.signal() + } + } + + func stop() { + shouldStop.store(true, ordering: .relaxed) + semaphore.signal() // Wake up the worker thread + workerThread?.cancel() + workerThread = nil + } + + private func runWorkerLoop() { + // Enter container namespace + do { + try enterContainerNamespace() + } catch { + return + } + + // Worker loop + while !shouldStop.load(ordering: .relaxed) { + semaphore.wait() + + // Check stop condition again after waking up + if shouldStop.load(ordering: .relaxed) { + break + } + + // Process all queued events + let events = eventQueue.withLock { (queue: inout [FileSystemEvent]) -> [FileSystemEvent] in + let currentEvents = Array(queue) + queue.removeAll() + return currentEvents + } + + for event in events { + do { + try 
generateSyntheticInotifyEvent(path: event.path, eventType: event.eventType) + event.completion(.success(())) + } catch { + event.completion(.failure(error)) + } + } + } + } + + private func enterContainerNamespace() throws { + let nsPath = "/proc/\(containerPID)/ns/mnt" + let fd = open(nsPath, O_RDONLY) + guard fd >= 0 else { + throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath)") + } + defer { _ = close(fd) } + + guard setns(fd, CLONE_NEWNS) == 0 else { + throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: \(errno)") + } + } + + private func generateSyntheticInotifyEvent( + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + ) throws { + if eventType == .delete && !FileManager.default.fileExists(atPath: path) { + return + } + + let attributes = try FileManager.default.attributesOfItem(atPath: path) + guard let permissions = attributes[.posixPermissions] as? NSNumber else { + throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") + } + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) + } + } var pid: Int32? { self.initProcess.pid @@ -77,6 +210,9 @@ actor ManagedContainer { self.id = id self.bundle = bundle self.log = log + + // Initialize namespace worker - will be started after process starts + self.namespaceWorker = nil } catch { try? 
cgManager.delete() throw error @@ -94,6 +230,26 @@ extension ManagedContainer { } } + /// Start namespace worker thread after container process starts + private func startNamespaceWorker() throws { + let pid = self.initProcess.pid + guard pid > 0 else { + throw ContainerizationError(.invalidState, message: "Container process not started") + } + + let worker = NamespaceWorker(containerID: self.id, containerPID: pid) + try worker.start() + self.namespaceWorker = worker + } + + /// Execute filesystem event using dedicated namespace thread + func executeFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { + guard let worker = self.namespaceWorker else { + throw ContainerizationError(.invalidState, message: "Namespace worker not started for container \(self.id)") + } + try await worker.enqueueEvent(path: path, eventType: eventType) + } + func createExec( id: String, stdio: HostStdio, @@ -119,7 +275,14 @@ extension ManagedContainer { func start(execID: String) async throws -> Int32 { let proc = try self.getExecOrInit(execID: execID) - return try await ProcessSupervisor.default.start(process: proc) + let pid = try await ProcessSupervisor.default.start(process: proc) + + // Start namespace worker thread if this is the init process + if execID == self.id { + try self.startNamespaceWorker() + } + + return pid } func wait(execID: String) async throws -> ManagedProcess.ExitStatus { @@ -153,6 +316,10 @@ extension ManagedContainer { } func delete() throws { + // Stop namespace worker thread + self.namespaceWorker?.stop() + self.namespaceWorker = nil + try self.bundle.delete() try self.cgroupManager.delete(force: true) } From 5e19a21108fe851e5404afe98e2ef5801ca6acca Mon Sep 17 00:00:00 2001 From: Raj Date: Fri, 26 Sep 2025 17:33:58 -0700 Subject: [PATCH 05/13] switch to fork-based FSNotify architecture to resolve setns() multithreading restrictions --- Sources/Integration/Suite.swift | 3 +- 
.../Sources/vminitd/ManagedContainer.swift | 351 +++++++++++++++--- vminitd/Sources/vminitd/Server+GRPC.swift | 39 +- 3 files changed, 312 insertions(+), 81 deletions(-) diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 29bfd9ae..530b5756 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -162,8 +162,7 @@ struct IntegrationSuite: AsyncParsableCommand { static let eventLoop = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - func bootstrap(_ testID: String) async throws -> (rootfs: Containerization.Mount, vmm: VirtualMachineManager, image: Containerization.Image, bootlog: URL) { - let reference = "ghcr.io/linuxcontainers/alpine:3.20" + func bootstrap(_ testID: String, reference: String = "ghcr.io/linuxcontainers/alpine:3.20") async throws -> (rootfs: Containerization.Mount, vmm: VirtualMachineManager, image: Containerization.Image, bootlog: URL) { let store = Self.imageStore let initImage = try await store.getInitImage(reference: Self.initImage) diff --git a/vminitd/Sources/vminitd/ManagedContainer.swift b/vminitd/Sources/vminitd/ManagedContainer.swift index bdd6a324..fb5b46e1 100644 --- a/vminitd/Sources/vminitd/ManagedContainer.swift +++ b/vminitd/Sources/vminitd/ManagedContainer.swift @@ -39,108 +39,347 @@ actor ManagedContainer { private var execs: [String: ManagedProcess] = [:] private var namespaceWorker: NamespaceWorker? - /// Worker thread that runs in container's namespace for filesystem operations + /// Worker child process that runs in container's namespace for filesystem operations private final class NamespaceWorker: @unchecked Sendable { private let containerID: String private let containerPID: Int32 - private var workerThread: Thread? - private let eventQueue: Mutex<[FileSystemEvent]> = Mutex([]) - private let semaphore: DispatchSemaphore = DispatchSemaphore(value: 0) + private var childPID: Int32? + private var parentSocket: Int32? 
+ private var eventIDCounter: UInt32 = 0 + private let pendingEvents: Mutex<[UInt32: CheckedContinuation]> = Mutex([:]) + private var responseReaderTask: Task? private let shouldStop: Atomic = Atomic(false) - struct FileSystemEvent: Sendable { - let path: String - let eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType - let completion: @Sendable (Result) -> Void - } - init(containerID: String, containerPID: Int32) { self.containerID = containerID self.containerPID = containerPID } func start() throws { - guard workerThread == nil else { + guard childPID == nil else { throw ContainerizationError(.invalidState, message: "NamespaceWorker already started") } - let thread = Thread { [weak self] in - self?.runWorkerLoop() + // Create socketpair for parent-child communication + var sockets: [Int32] = [0, 0] + guard socketpair(AF_UNIX, SOCK_STREAM, 0, &sockets) == 0 else { + throw ContainerizationError(.internalError, message: "Failed to create socketpair: errno \(errno)") + } + + let parentSocket = sockets[0] + let childSocket = sockets[1] + + // Fork child process + let pid = fork() + guard pid >= 0 else { + close(parentSocket) + close(childSocket) + throw ContainerizationError(.internalError, message: "Failed to fork: errno \(errno)") + } + + if pid == 0 { + // Child process + close(parentSocket) + runChildProcess(socket: childSocket) + exit(0) + } else { + // Parent process + close(childSocket) + self.childPID = pid + self.parentSocket = parentSocket + + // Wait for child to signal ready or failure + var signal: UInt8 = 0 + let readResult = read(parentSocket, &signal, 1) + + if readResult != 1 { + // Child failed to send signal + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process failed to start") + } + + if signal == 0xFF { + // Child failed to enter namespace + close(parentSocket) + self.parentSocket = nil + var 
status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process failed to enter container namespace") + } + + if signal != 0xAA { + // Unexpected signal + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process sent unexpected signal: \(signal)") + } + + // Start response reader task + self.responseReaderTask = Task { [weak self] in + await self?.readChildResponses() + } } - thread.name = "namespace-worker-\(containerID)" - workerThread = thread - thread.start() } func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { - try await withCheckedThrowingContinuation { continuation in - let event = FileSystemEvent( - path: path, - eventType: eventType, - completion: { @Sendable result in - continuation.resume(with: result) - } - ) + guard let socket = parentSocket, !shouldStop.load(ordering: .relaxed) else { + throw ContainerizationError(.invalidState, message: "NamespaceWorker not running") + } + + let eventID = eventIDCounter + eventIDCounter += 1 - eventQueue.withLock { (queue: inout [FileSystemEvent]) in - queue.append(event) + // Store continuation for this event + try await withCheckedThrowingContinuation { (continuation: CheckedContinuation) in + pendingEvents.withLock { events in + events[eventID] = continuation + } + + // Send event to child process + do { + try sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) + } catch { + // Remove from pending events if send failed + _ = pendingEvents.withLock { events in + events.removeValue(forKey: eventID) + } + continuation.resume(throwing: error) } - semaphore.signal() } } func stop() { shouldStop.store(true, ordering: .relaxed) - semaphore.signal() // Wake up the worker thread - workerThread?.cancel() - workerThread = nil 
+ + // Cancel response reader task + responseReaderTask?.cancel() + responseReaderTask = nil + + // Close parent socket + if let socket = parentSocket { + close(socket) + parentSocket = nil + } + + // Terminate child process + if let pid = childPID { + #if canImport(Musl) + Musl.kill(pid, SIGTERM) + #elseif canImport(Glibc) + Glibc.kill(pid, SIGTERM) + #endif + + // Wait for child to exit + var status: Int32 = 0 + waitpid(pid, &status, 0) + childPID = nil + } + + // Cancel all pending events + pendingEvents.withLock { events in + for (_, continuation) in events { + continuation.resume(throwing: ContainerizationError(.cancelled, message: "NamespaceWorker stopped")) + } + events.removeAll() + } } - private func runWorkerLoop() { + private func runChildProcess(socket: Int32) { // Enter container namespace do { try enterContainerNamespace() } catch { - return + // Signal parent that namespace entry failed, then exit + var failureResponse: UInt8 = 0xFF // Special failure signal + _ = write(socket, &failureResponse, 1) + close(socket) + exit(1) + } + + // Signal parent that we're ready + var readySignal: UInt8 = 0xAA // Ready signal + guard write(socket, &readySignal, 1) == 1 else { + close(socket) + exit(1) } - // Worker loop + // Child event loop + while true { + do { + // Read event from parent + guard let (eventID, path, eventType) = try readEventFromParent(socket: socket) else { + break // Parent closed socket + } + + // Process filesystem event + var success: UInt8 = 1 + do { + try generateSyntheticInotifyEvent(path: path, eventType: eventType) + } catch { + success = 0 + } + + // Send response to parent + try sendResponseToParent(socket: socket, eventID: eventID, success: success) + } catch { + break + } + } + + close(socket) + } + + private func readChildResponses() async { + guard let socket = parentSocket else { return } + while !shouldStop.load(ordering: .relaxed) { - semaphore.wait() + do { + // Read response from child + guard let (eventID, success) = try 
readResponseFromChild(socket: socket) else { + break // Socket closed + } - // Check stop condition again after waking up - if shouldStop.load(ordering: .relaxed) { + // Resume the corresponding continuation + pendingEvents.withLock { events in + if let continuation = events.removeValue(forKey: eventID) { + if success == 1 { + continuation.resume() + } else { + continuation.resume(throwing: ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) + } + } + } + } catch { break } + } + } - // Process all queued events - let events = eventQueue.withLock { (queue: inout [FileSystemEvent]) -> [FileSystemEvent] in - let currentEvents = Array(queue) - queue.removeAll() - return currentEvents + private func sendEventToChild(socket: Int32, eventID: UInt32, path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) throws { + let pathData = path.data(using: .utf8) ?? Data() + let pathLen = UInt32(pathData.count) + let eventTypeValue = UInt32(eventType.rawValue) + + // Binary protocol: [event_type:4][path_len:4][path:N][event_id:4] + var buffer = Data() + buffer.append(contentsOf: withUnsafeBytes(of: eventTypeValue.bigEndian) { Data($0) }) + buffer.append(contentsOf: withUnsafeBytes(of: pathLen.bigEndian) { Data($0) }) + buffer.append(pathData) + buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) + + try buffer.withUnsafeBytes { bytes in + let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) + guard written == buffer.count else { + throw ContainerizationError(.internalError, message: "Failed to write event to child: written \(written), expected \(buffer.count)") } + } + } - for event in events { - do { - try generateSyntheticInotifyEvent(path: event.path, eventType: event.eventType) - event.completion(.success(())) - } catch { - event.completion(.failure(error)) - } + private func readEventFromParent(socket: Int32) throws -> (UInt32, String, 
Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? { + // Read event_type:4 + var eventTypeValue: UInt32 = 0 + guard read(socket, &eventTypeValue, 4) == 4 else { return nil } + eventTypeValue = UInt32(bigEndian: eventTypeValue) + + // Read path_len:4 + var pathLen: UInt32 = 0 + guard read(socket, &pathLen, 4) == 4 else { return nil } + pathLen = UInt32(bigEndian: pathLen) + + // Read path:N + let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) + defer { pathData.deallocate() } + guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } + let pathBytes = Data(bytes: pathData, count: Int(pathLen)) + guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } + + // Read event_id:4 + var eventID: UInt32 = 0 + guard read(socket, &eventID, 4) == 4 else { return nil } + eventID = UInt32(bigEndian: eventID) + + guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { + return nil + } + + return (eventID, path, eventType) + } + + private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { + // Binary protocol: [event_id:4][success:1] + var buffer = Data() + buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) + buffer.append(success) + + try buffer.withUnsafeBytes { bytes in + let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) + guard written == buffer.count else { + throw ContainerizationError(.internalError, message: "Failed to write response to parent") } } } + private func readResponseFromChild(socket: Int32) throws -> (UInt32, UInt8)? 
{ + // Read event_id:4 + var eventID: UInt32 = 0 + guard read(socket, &eventID, 4) == 4 else { return nil } + eventID = UInt32(bigEndian: eventID) + + // Read success:1 + var success: UInt8 = 0 + guard read(socket, &success, 1) == 1 else { return nil } + + return (eventID, success) + } + private func enterContainerNamespace() throws { let nsPath = "/proc/\(containerPID)/ns/mnt" + let vmNsPath = "/proc/self/ns/mnt" + + guard FileManager.default.fileExists(atPath: nsPath) else { + throw ContainerizationError(.internalError, message: "Namespace file does not exist: \(nsPath)") + } + + // Compare namespace inodes to see if they're the same + let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + defer { + containerNsStatPtr.deallocate() + vmNsStatPtr.deallocate() + } + + let containerStatResult = stat(nsPath, containerNsStatPtr) + let vmStatResult = stat(vmNsPath, vmNsStatPtr) + + if containerStatResult == 0 && vmStatResult == 0 { + let containerInode = containerNsStatPtr.pointee.st_ino + let vmInode = vmNsStatPtr.pointee.st_ino + + if containerInode == vmInode { + // Skip setns() since we're already in the right namespace + return + } + } + let fd = open(nsPath, O_RDONLY) guard fd >= 0 else { - throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath)") + throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") + } + defer { + _ = close(fd) } - defer { _ = close(fd) } - guard setns(fd, CLONE_NEWNS) == 0 else { - throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: \(errno)") + let setnsResult = setns(fd, CLONE_NEWNS) + guard setnsResult == 0 else { + throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") } } @@ -230,7 +469,7 @@ extension ManagedContainer { } } - /// Start namespace worker thread after container 
process starts + /// Start namespace worker child process after container process starts private func startNamespaceWorker() throws { let pid = self.initProcess.pid guard pid > 0 else { @@ -242,7 +481,7 @@ extension ManagedContainer { self.namespaceWorker = worker } - /// Execute filesystem event using dedicated namespace thread + /// Execute filesystem event using dedicated namespace child process func executeFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { guard let worker = self.namespaceWorker else { throw ContainerizationError(.invalidState, message: "Namespace worker not started for container \(self.id)") @@ -277,7 +516,7 @@ extension ManagedContainer { let proc = try self.getExecOrInit(execID: execID) let pid = try await ProcessSupervisor.default.start(process: proc) - // Start namespace worker thread if this is the init process + // Start namespace worker child process if this is the init process if execID == self.id { try self.startNamespaceWorker() } @@ -316,7 +555,7 @@ extension ManagedContainer { } func delete() throws { - // Stop namespace worker thread + // Stop namespace worker child process self.namespaceWorker?.stop() self.namespaceWorker = nil diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index daf843b4..35ac3813 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -1091,16 +1091,27 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid log.debug( "notifyFileSystemEvent", metadata: [ + "containerID": "\(request.containerID)", "path": "\(request.path)", "eventType": "\(request.eventType)", ]) - do { - try await generateSyntheticInotifyEvent( - path: request.path, - eventType: request.eventType - ) + guard let container = await self.state.containers[request.containerID] else { + log.warning( + "fs event for non-existent container", + metadata: [ + 
"containerID": "\(request.containerID)" + ]) + let response = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse.with { + $0.success = false + $0.error = "fs event for non-existent container: \(request.containerID)" + } + try await responseStream.send(response) + return + } + do { + try await container.executeFileSystemEvent(path: request.path, eventType: request.eventType) let response = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse.with { $0.success = true } @@ -1121,24 +1132,6 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid } } } - - private func generateSyntheticInotifyEvent( - path: String, - eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType - ) async throws { - if eventType == .delete && !FileManager.default.fileExists(atPath: path) { - return - } - - let attributes = try FileManager.default.attributesOfItem(atPath: path) - guard let permissions = attributes[.posixPermissions] as? NSNumber else { - throw GRPCStatus(code: .internalError, message: "Failed to get file permissions for path: \(path)") - } - try FileManager.default.setAttributes( - [.posixPermissions: permissions], - ofItemAtPath: path - ) - } } extension Com_Apple_Containerization_Sandbox_V3_ConfigureHostsRequest { From 91d4788580973e3022261ba347cb4726224ed9c4 Mon Sep 17 00:00:00 2001 From: Raj Date: Wed, 8 Oct 2025 16:17:02 -0700 Subject: [PATCH 06/13] fix comments --- .../SandboxContext/SandboxContext.pb.swift | 5 - .../SandboxContext/SandboxContext.proto | 1 - Sources/Containerization/Vminitd.swift | 46 +- .../vminitd/FilesystemEventWorker.swift | 378 +++++++++++++++++ .../Sources/vminitd/ManagedContainer.swift | 395 +----------------- vminitd/Sources/vminitd/Server+GRPC.swift | 3 +- 6 files changed, 427 insertions(+), 401 deletions(-) create mode 100644 vminitd/Sources/vminitd/FilesystemEventWorker.swift diff --git a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift 
b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift index a4994a38..1d640cdc 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.pb.swift +++ b/Sources/Containerization/SandboxContext/SandboxContext.pb.swift @@ -44,7 +44,6 @@ public enum Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProt case link // = 2 case unlink // = 3 case modify // = 4 - case undefined // = 99 case UNRECOGNIZED(Int) public init() { @@ -58,7 +57,6 @@ public enum Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProt case 2: self = .link case 3: self = .unlink case 4: self = .modify - case 99: self = .undefined default: self = .UNRECOGNIZED(rawValue) } } @@ -70,7 +68,6 @@ public enum Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProt case .link: return 2 case .unlink: return 3 case .modify: return 4 - case .undefined: return 99 case .UNRECOGNIZED(let i): return i } } @@ -82,7 +79,6 @@ public enum Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProt .link, .unlink, .modify, - .undefined, ] } @@ -1306,7 +1302,6 @@ extension Com_Apple_Containerization_Sandbox_V3_FileSystemEventType: SwiftProtob 2: .same(proto: "LINK"), 3: .same(proto: "UNLINK"), 4: .same(proto: "MODIFY"), - 99: .same(proto: "UNDEFINED"), ] } diff --git a/Sources/Containerization/SandboxContext/SandboxContext.proto b/Sources/Containerization/SandboxContext/SandboxContext.proto index 722772e9..f6458306 100644 --- a/Sources/Containerization/SandboxContext/SandboxContext.proto +++ b/Sources/Containerization/SandboxContext/SandboxContext.proto @@ -361,7 +361,6 @@ enum FileSystemEventType { LINK = 2; UNLINK = 3; MODIFY = 4; - UNDEFINED = 99; } message NotifyFileSystemEventRequest { diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index 84a64962..e8b053f3 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -333,6 +333,8 @@ extension Vminitd: 
VirtualMachineAgent { /// Vminitd specific rpcs. extension Vminitd { + public typealias FileSystemEventRequest = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest + public typealias FileSystemEventResponse = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse /// Sets up an emulator in the guest. public func setupEmulator(binaryPath: String, configuration: Binfmt.Entry) async throws { let request = Com_Apple_Containerization_Sandbox_V3_SetupEmulatorRequest.with { @@ -428,30 +430,44 @@ extension Vminitd { try await self.sync() } - /// Send a filesystem event notification to the guest. + /// Send filesystem event notifications to the guest + public func notifyFileSystemEvents( + _ events: [FileSystemEventRequest] + ) async throws -> [FileSystemEventResponse] { + let requests = AsyncStream { continuation in + for event in events { + continuation.yield(event) + } + continuation.finish() + } + + let responses = client.notifyFileSystemEvent(requests) + var results: [FileSystemEventResponse] = [] + + for try await response in responses { + results.append(response) + } + + guard results.count == events.count else { + throw ContainerizationError(.internalError, message: "Expected \(events.count) responses, got \(results.count)") + } + + return results + } + public func notifyFileSystemEvent( path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType, containerID: String - ) async throws -> Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse { - let request = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest.with { + ) async throws -> FileSystemEventResponse { + let request = FileSystemEventRequest.with { $0.path = path $0.eventType = eventType $0.containerID = containerID } - let requests = AsyncStream { continuation in - continuation.yield(request) - continuation.finish() - } - - let responses = client.notifyFileSystemEvent(requests) - - for try await response in responses { - return 
response - } - - throw ContainerizationError(.internalError, message: "No response received from notifyFileSystemEvent") + let responses = try await notifyFileSystemEvents([request]) + return responses[0] } } diff --git a/vminitd/Sources/vminitd/FilesystemEventWorker.swift b/vminitd/Sources/vminitd/FilesystemEventWorker.swift new file mode 100644 index 00000000..2c0ba463 --- /dev/null +++ b/vminitd/Sources/vminitd/FilesystemEventWorker.swift @@ -0,0 +1,378 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2025 Apple Inc. and the Containerization project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Containerization +import ContainerizationError +import Foundation +import NIOCore +import NIOPosix +import Synchronization + +#if canImport(Musl) +import Musl +#elseif canImport(Glibc) +import Glibc +#endif + +final class FilesystemEventWorker: @unchecked Sendable { + private static let handshakeReady: UInt8 = 0xAA + private static let handshakeFailure: UInt8 = 0xFF + + private let containerID: String + private let containerPID: Int32 + private var childPID: Int32? + private var parentSocket: Int32? + private var channel: Channel? 
+ private let eventLoop: EventLoop + private var eventIDCounter: UInt32 = 0 + private let pendingEvents: Mutex<[UInt32: CheckedContinuation]> = Mutex([:]) + private let shouldStop: Atomic = Atomic(false) + + init(containerID: String, containerPID: Int32, eventLoop: EventLoop) { + self.containerID = containerID + self.containerPID = containerPID + self.eventLoop = eventLoop + } + + func start() throws { + guard childPID == nil else { + throw ContainerizationError(.invalidState, message: "FilesystemEventWorker already started") + } + + var sockets: [Int32] = [0, 0] + guard socketpair(AF_UNIX, SOCK_STREAM, 0, &sockets) == 0 else { + throw ContainerizationError(.internalError, message: "Failed to create socketpair: errno \(errno)") + } + + let parentSocket = sockets[0] + let childSocket = sockets[1] + + let pid = fork() + guard pid >= 0 else { + close(parentSocket) + close(childSocket) + throw ContainerizationError(.internalError, message: "Failed to fork: errno \(errno)") + } + + if pid == 0 { + close(parentSocket) + runChildProcess(socket: childSocket) + exit(0) + } else { + close(childSocket) + self.childPID = pid + self.parentSocket = parentSocket + + var handshake: UInt8 = 0 + let readResult = read(parentSocket, &handshake, 1) + + if readResult != 1 { + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process failed to start") + } + + if handshake == Self.handshakeFailure { + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process failed to enter container namespace") + } + + if handshake != Self.handshakeReady { + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process 
sent unexpected handshake: \(handshake)") + } + + do { + let bootstrap = NIOPipeBootstrap(group: eventLoop) + .channelInitializer { channel in + let handler = ResponseHandler(worker: self) + return channel.pipeline.addHandler(handler) + } + self.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() + } catch { + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Failed to setup NIO channel: \(error)") + } + } + } + + func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { + guard let socket = parentSocket, !shouldStop.load(ordering: .relaxed) else { + throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") + } + + let eventID = eventIDCounter + eventIDCounter += 1 + + try await withCheckedThrowingContinuation { (continuation: CheckedContinuation) in + pendingEvents.withLock { events in + events[eventID] = continuation + } + + do { + try sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) + } catch { + _ = pendingEvents.withLock { events in + events.removeValue(forKey: eventID) + } + continuation.resume(throwing: error) + } + } + } + + func stop() { + shouldStop.store(true, ordering: .relaxed) + + if let channel = self.channel { + try? 
channel.close().wait() + self.channel = nil + } + + self.parentSocket = nil + + if let pid = childPID { + #if canImport(Musl) + Musl.kill(pid, SIGTERM) + #elseif canImport(Glibc) + Glibc.kill(pid, SIGTERM) + #endif + + var status: Int32 = 0 + waitpid(pid, &status, 0) + childPID = nil + } + + pendingEvents.withLock { events in + for (_, continuation) in events { + continuation.resume(throwing: ContainerizationError(.cancelled, message: "FilesystemEventWorker stopped")) + } + events.removeAll() + } + } + + private func runChildProcess(socket: Int32) { + do { + try enterContainerNamespace() + } catch { + var failureHandshake = Self.handshakeFailure + _ = write(socket, &failureHandshake, 1) + close(socket) + exit(1) + } + + var readyHandshake = Self.handshakeReady + guard write(socket, &readyHandshake, 1) == 1 else { + close(socket) + exit(1) + } + + while true { + do { + guard let (eventID, path, eventType) = try readEventFromParent(socket: socket) else { + break + } + + var success: UInt8 = 1 + do { + try generateSyntheticInotifyEvent(path: path, eventType: eventType) + } catch { + success = 0 + } + + try sendResponseToParent(socket: socket, eventID: eventID, success: success) + } catch { + break + } + } + + close(socket) + } + + private func sendEventToChild(socket: Int32, eventID: UInt32, path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) throws { + let pathData = path.data(using: .utf8) ?? 
Data() + let pathLen = UInt32(pathData.count) + let eventTypeValue = UInt32(eventType.rawValue) + + var buffer = Data() + buffer.append(contentsOf: withUnsafeBytes(of: eventTypeValue.bigEndian) { Data($0) }) + buffer.append(contentsOf: withUnsafeBytes(of: pathLen.bigEndian) { Data($0) }) + buffer.append(pathData) + buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) + + try buffer.withUnsafeBytes { bytes in + let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) + guard written == buffer.count else { + throw ContainerizationError(.internalError, message: "Failed to write event to child: written \(written), expected \(buffer.count)") + } + } + } + + private func readEventFromParent(socket: Int32) throws -> (UInt32, String, Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? { + var eventTypeValue: UInt32 = 0 + guard read(socket, &eventTypeValue, 4) == 4 else { return nil } + eventTypeValue = UInt32(bigEndian: eventTypeValue) + + var pathLen: UInt32 = 0 + guard read(socket, &pathLen, 4) == 4 else { return nil } + pathLen = UInt32(bigEndian: pathLen) + + let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) + defer { pathData.deallocate() } + guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } + let pathBytes = Data(bytes: pathData, count: Int(pathLen)) + guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } + + var eventID: UInt32 = 0 + guard read(socket, &eventID, 4) == 4 else { return nil } + eventID = UInt32(bigEndian: eventID) + + guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { + return nil + } + + return (eventID, path, eventType) + } + + private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { + var buffer = Data() + buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) + buffer.append(success) + + try 
buffer.withUnsafeBytes { bytes in + let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) + guard written == buffer.count else { + throw ContainerizationError(.internalError, message: "Failed to write response to parent") + } + } + } + + private func enterContainerNamespace() throws { + let nsPath = "/proc/\(containerPID)/ns/mnt" + let vmNsPath = "/proc/self/ns/mnt" + + guard FileManager.default.fileExists(atPath: nsPath) else { + throw ContainerizationError(.internalError, message: "Namespace file does not exist: \(nsPath)") + } + + let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + defer { + containerNsStatPtr.deallocate() + vmNsStatPtr.deallocate() + } + + let containerStatResult = stat(nsPath, containerNsStatPtr) + let vmStatResult = stat(vmNsPath, vmNsStatPtr) + + if containerStatResult == 0 && vmStatResult == 0 { + let containerInode = containerNsStatPtr.pointee.st_ino + let vmInode = vmNsStatPtr.pointee.st_ino + + if containerInode == vmInode { + return + } + } + + let fd = open(nsPath, O_RDONLY) + guard fd >= 0 else { + throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") + } + defer { + _ = close(fd) + } + + let setnsResult = setns(fd, CLONE_NEWNS) + guard setnsResult == 0 else { + throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") + } + } + + private func generateSyntheticInotifyEvent( + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + ) throws { + if eventType == .delete && !FileManager.default.fileExists(atPath: path) { + return + } + + let attributes = try FileManager.default.attributesOfItem(atPath: path) + guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { + throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") + } + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) + } + + private final class ResponseHandler: ChannelInboundHandler, @unchecked Sendable { + typealias InboundIn = ByteBuffer + + private var buffer = ByteBuffer() + private unowned let worker: FilesystemEventWorker + + init(worker: FilesystemEventWorker) { + self.worker = worker + } + + func channelRead(context: ChannelHandlerContext, data: NIOAny) { + var inBuffer = unwrapInboundIn(data) + buffer.writeBuffer(&inBuffer) + + while buffer.readableBytes >= 5 { + guard let eventID = buffer.readInteger(endianness: .big, as: UInt32.self), + let success = buffer.readInteger(as: UInt8.self) + else { + break + } + + worker.pendingEvents.withLock { events in + if let continuation = events.removeValue(forKey: eventID) { + if success == 1 { + continuation.resume() + } else { + continuation.resume(throwing: ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) + } + } + } + } + } + + func errorCaught(context: ChannelHandlerContext, error: Error) { + worker.pendingEvents.withLock { events in + for (_, continuation) in events { + continuation.resume(throwing: error) + } + events.removeAll() + } + } + } +} diff --git a/vminitd/Sources/vminitd/ManagedContainer.swift b/vminitd/Sources/vminitd/ManagedContainer.swift index fb5b46e1..ec9f8872 100644 --- a/vminitd/Sources/vminitd/ManagedContainer.swift +++ b/vminitd/Sources/vminitd/ManagedContainer.swift @@ -21,6 +21,7 @@ import ContainerizationOCI import ContainerizationOS import Foundation import Logging +import NIOCore import Synchronization #if canImport(Musl) @@ -36,371 +37,9 @@ actor ManagedContainer { private let cgroupManager: Cgroup2Manager private let log: Logger private let bundle: ContainerizationOCI.Bundle + private let group: EventLoopGroup private var 
execs: [String: ManagedProcess] = [:] - private var namespaceWorker: NamespaceWorker? - - /// Worker child process that runs in container's namespace for filesystem operations - private final class NamespaceWorker: @unchecked Sendable { - private let containerID: String - private let containerPID: Int32 - private var childPID: Int32? - private var parentSocket: Int32? - private var eventIDCounter: UInt32 = 0 - private let pendingEvents: Mutex<[UInt32: CheckedContinuation]> = Mutex([:]) - private var responseReaderTask: Task? - private let shouldStop: Atomic = Atomic(false) - - init(containerID: String, containerPID: Int32) { - self.containerID = containerID - self.containerPID = containerPID - } - - func start() throws { - guard childPID == nil else { - throw ContainerizationError(.invalidState, message: "NamespaceWorker already started") - } - - // Create socketpair for parent-child communication - var sockets: [Int32] = [0, 0] - guard socketpair(AF_UNIX, SOCK_STREAM, 0, &sockets) == 0 else { - throw ContainerizationError(.internalError, message: "Failed to create socketpair: errno \(errno)") - } - - let parentSocket = sockets[0] - let childSocket = sockets[1] - - // Fork child process - let pid = fork() - guard pid >= 0 else { - close(parentSocket) - close(childSocket) - throw ContainerizationError(.internalError, message: "Failed to fork: errno \(errno)") - } - - if pid == 0 { - // Child process - close(parentSocket) - runChildProcess(socket: childSocket) - exit(0) - } else { - // Parent process - close(childSocket) - self.childPID = pid - self.parentSocket = parentSocket - - // Wait for child to signal ready or failure - var signal: UInt8 = 0 - let readResult = read(parentSocket, &signal, 1) - - if readResult != 1 { - // Child failed to send signal - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process failed to start") - } - - 
if signal == 0xFF { - // Child failed to enter namespace - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process failed to enter container namespace") - } - - if signal != 0xAA { - // Unexpected signal - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process sent unexpected signal: \(signal)") - } - - // Start response reader task - self.responseReaderTask = Task { [weak self] in - await self?.readChildResponses() - } - } - } - - func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { - guard let socket = parentSocket, !shouldStop.load(ordering: .relaxed) else { - throw ContainerizationError(.invalidState, message: "NamespaceWorker not running") - } - - let eventID = eventIDCounter - eventIDCounter += 1 - - // Store continuation for this event - try await withCheckedThrowingContinuation { (continuation: CheckedContinuation) in - pendingEvents.withLock { events in - events[eventID] = continuation - } - - // Send event to child process - do { - try sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) - } catch { - // Remove from pending events if send failed - _ = pendingEvents.withLock { events in - events.removeValue(forKey: eventID) - } - continuation.resume(throwing: error) - } - } - } - - func stop() { - shouldStop.store(true, ordering: .relaxed) - - // Cancel response reader task - responseReaderTask?.cancel() - responseReaderTask = nil - - // Close parent socket - if let socket = parentSocket { - close(socket) - parentSocket = nil - } - - // Terminate child process - if let pid = childPID { - #if canImport(Musl) - Musl.kill(pid, SIGTERM) - #elseif canImport(Glibc) - Glibc.kill(pid, SIGTERM) - #endif - - 
// Wait for child to exit - var status: Int32 = 0 - waitpid(pid, &status, 0) - childPID = nil - } - - // Cancel all pending events - pendingEvents.withLock { events in - for (_, continuation) in events { - continuation.resume(throwing: ContainerizationError(.cancelled, message: "NamespaceWorker stopped")) - } - events.removeAll() - } - } - - private func runChildProcess(socket: Int32) { - // Enter container namespace - do { - try enterContainerNamespace() - } catch { - // Signal parent that namespace entry failed, then exit - var failureResponse: UInt8 = 0xFF // Special failure signal - _ = write(socket, &failureResponse, 1) - close(socket) - exit(1) - } - - // Signal parent that we're ready - var readySignal: UInt8 = 0xAA // Ready signal - guard write(socket, &readySignal, 1) == 1 else { - close(socket) - exit(1) - } - - // Child event loop - while true { - do { - // Read event from parent - guard let (eventID, path, eventType) = try readEventFromParent(socket: socket) else { - break // Parent closed socket - } - - // Process filesystem event - var success: UInt8 = 1 - do { - try generateSyntheticInotifyEvent(path: path, eventType: eventType) - } catch { - success = 0 - } - - // Send response to parent - try sendResponseToParent(socket: socket, eventID: eventID, success: success) - } catch { - break - } - } - - close(socket) - } - - private func readChildResponses() async { - guard let socket = parentSocket else { return } - - while !shouldStop.load(ordering: .relaxed) { - do { - // Read response from child - guard let (eventID, success) = try readResponseFromChild(socket: socket) else { - break // Socket closed - } - - // Resume the corresponding continuation - pendingEvents.withLock { events in - if let continuation = events.removeValue(forKey: eventID) { - if success == 1 { - continuation.resume() - } else { - continuation.resume(throwing: ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) - } - } - } - } catch { 
- break - } - } - } - - private func sendEventToChild(socket: Int32, eventID: UInt32, path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) throws { - let pathData = path.data(using: .utf8) ?? Data() - let pathLen = UInt32(pathData.count) - let eventTypeValue = UInt32(eventType.rawValue) - - // Binary protocol: [event_type:4][path_len:4][path:N][event_id:4] - var buffer = Data() - buffer.append(contentsOf: withUnsafeBytes(of: eventTypeValue.bigEndian) { Data($0) }) - buffer.append(contentsOf: withUnsafeBytes(of: pathLen.bigEndian) { Data($0) }) - buffer.append(pathData) - buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) - - try buffer.withUnsafeBytes { bytes in - let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) - guard written == buffer.count else { - throw ContainerizationError(.internalError, message: "Failed to write event to child: written \(written), expected \(buffer.count)") - } - } - } - - private func readEventFromParent(socket: Int32) throws -> (UInt32, String, Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? 
{ - // Read event_type:4 - var eventTypeValue: UInt32 = 0 - guard read(socket, &eventTypeValue, 4) == 4 else { return nil } - eventTypeValue = UInt32(bigEndian: eventTypeValue) - - // Read path_len:4 - var pathLen: UInt32 = 0 - guard read(socket, &pathLen, 4) == 4 else { return nil } - pathLen = UInt32(bigEndian: pathLen) - - // Read path:N - let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) - defer { pathData.deallocate() } - guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } - let pathBytes = Data(bytes: pathData, count: Int(pathLen)) - guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } - - // Read event_id:4 - var eventID: UInt32 = 0 - guard read(socket, &eventID, 4) == 4 else { return nil } - eventID = UInt32(bigEndian: eventID) - - guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { - return nil - } - - return (eventID, path, eventType) - } - - private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { - // Binary protocol: [event_id:4][success:1] - var buffer = Data() - buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) - buffer.append(success) - - try buffer.withUnsafeBytes { bytes in - let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) - guard written == buffer.count else { - throw ContainerizationError(.internalError, message: "Failed to write response to parent") - } - } - } - - private func readResponseFromChild(socket: Int32) throws -> (UInt32, UInt8)? 
{ - // Read event_id:4 - var eventID: UInt32 = 0 - guard read(socket, &eventID, 4) == 4 else { return nil } - eventID = UInt32(bigEndian: eventID) - - // Read success:1 - var success: UInt8 = 0 - guard read(socket, &success, 1) == 1 else { return nil } - - return (eventID, success) - } - - private func enterContainerNamespace() throws { - let nsPath = "/proc/\(containerPID)/ns/mnt" - let vmNsPath = "/proc/self/ns/mnt" - - guard FileManager.default.fileExists(atPath: nsPath) else { - throw ContainerizationError(.internalError, message: "Namespace file does not exist: \(nsPath)") - } - - // Compare namespace inodes to see if they're the same - let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - defer { - containerNsStatPtr.deallocate() - vmNsStatPtr.deallocate() - } - - let containerStatResult = stat(nsPath, containerNsStatPtr) - let vmStatResult = stat(vmNsPath, vmNsStatPtr) - - if containerStatResult == 0 && vmStatResult == 0 { - let containerInode = containerNsStatPtr.pointee.st_ino - let vmInode = vmNsStatPtr.pointee.st_ino - - if containerInode == vmInode { - // Skip setns() since we're already in the right namespace - return - } - } - - let fd = open(nsPath, O_RDONLY) - guard fd >= 0 else { - throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") - } - defer { - _ = close(fd) - } - - let setnsResult = setns(fd, CLONE_NEWNS) - guard setnsResult == 0 else { - throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") - } - } - - private func generateSyntheticInotifyEvent( - path: String, - eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType - ) throws { - if eventType == .delete && !FileManager.default.fileExists(atPath: path) { - return - } - - let attributes = try FileManager.default.attributesOfItem(atPath: path) - guard let permissions = 
attributes[.posixPermissions] as? NSNumber else { - throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") - } - try FileManager.default.setAttributes( - [.posixPermissions: permissions], - ofItemAtPath: path - ) - } - } + private var filesystemEventWorker: FilesystemEventWorker? var pid: Int32? { self.initProcess.pid @@ -410,7 +49,8 @@ actor ManagedContainer { id: String, stdio: HostStdio, spec: ContainerizationOCI.Spec, - log: Logger + log: Logger, + group: EventLoopGroup ) throws { var cgroupsPath: String if let cgPath = spec.linux?.cgroupsPath { @@ -449,9 +89,9 @@ actor ManagedContainer { self.id = id self.bundle = bundle self.log = log + self.group = group - // Initialize namespace worker - will be started after process starts - self.namespaceWorker = nil + self.filesystemEventWorker = nil } catch { try? cgManager.delete() throw error @@ -469,22 +109,21 @@ extension ManagedContainer { } } - /// Start namespace worker child process after container process starts - private func startNamespaceWorker() throws { + private func startFilesystemEventWorker() throws { let pid = self.initProcess.pid guard pid > 0 else { throw ContainerizationError(.invalidState, message: "Container process not started") } - let worker = NamespaceWorker(containerID: self.id, containerPID: pid) + let eventLoop = group.next() + let worker = FilesystemEventWorker(containerID: self.id, containerPID: pid, eventLoop: eventLoop) try worker.start() - self.namespaceWorker = worker + self.filesystemEventWorker = worker } - /// Execute filesystem event using dedicated namespace child process func executeFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { - guard let worker = self.namespaceWorker else { - throw ContainerizationError(.invalidState, message: "Namespace worker not started for container \(self.id)") + guard let worker = self.filesystemEventWorker else { + throw 
ContainerizationError(.invalidState, message: "Filesystem event worker not started for container \(self.id)") } try await worker.enqueueEvent(path: path, eventType: eventType) } @@ -516,9 +155,8 @@ extension ManagedContainer { let proc = try self.getExecOrInit(execID: execID) let pid = try await ProcessSupervisor.default.start(process: proc) - // Start namespace worker child process if this is the init process if execID == self.id { - try self.startNamespaceWorker() + try self.startFilesystemEventWorker() } return pid @@ -555,9 +193,8 @@ extension ManagedContainer { } func delete() throws { - // Stop namespace worker child process - self.namespaceWorker?.stop() - self.namespaceWorker = nil + self.filesystemEventWorker?.stop() + self.filesystemEventWorker = nil try self.bundle.delete() try self.cgroupManager.delete(force: true) diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index 35ac3813..c5ddf983 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -492,7 +492,8 @@ extension Initd: Com_Apple_Containerization_Sandbox_V3_SandboxContextAsyncProvid id: request.id, stdio: stdioPorts, spec: ociSpec, - log: self.log + log: self.log, + group: self.group ) try await self.state.add(container: ctr) } From 87ade01121c1146a09046295a689850b0a9e45f8 Mon Sep 17 00:00:00 2001 From: Raj Date: Mon, 13 Oct 2025 13:38:55 -0700 Subject: [PATCH 07/13] edit vminitd to have a fs-notify subcommand and fork+exec ourselves --- vminitd/Package.swift | 1 + vminitd/Sources/vminitd/Application.swift | 290 +++++++++++++++--- .../vminitd/FilesystemEventWorker.swift | 276 ++++++----------- .../Sources/vminitd/ManagedContainer.swift | 30 +- vminitd/Sources/vminitd/ManagedProcess.swift | 4 +- .../Sources/vminitd/ProcessSupervisor.swift | 4 +- 6 files changed, 354 insertions(+), 251 deletions(-) diff --git a/vminitd/Package.swift b/vminitd/Package.swift index 7c3b81c2..9070ae75 100644 --- 
a/vminitd/Package.swift +++ b/vminitd/Package.swift @@ -47,6 +47,7 @@ let package = Package( .executableTarget( name: "vminitd", dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), .product(name: "Logging", package: "swift-log"), .product(name: "_NIOFileSystem", package: "swift-nio"), .product(name: "Containerization", package: "containerization"), diff --git a/vminitd/Sources/vminitd/Application.swift b/vminitd/Sources/vminitd/Application.swift index af04ef2c..bd85e617 100644 --- a/vminitd/Sources/vminitd/Application.swift +++ b/vminitd/Sources/vminitd/Application.swift @@ -14,6 +14,7 @@ // limitations under the License. //===----------------------------------------------------------------------===// +import ArgumentParser import Containerization import ContainerizationError import ContainerizationOS @@ -25,15 +26,28 @@ import NIOPosix #if os(Linux) import Musl import LCShim +#else +import Darwin #endif @main -struct Application { - private static let foregroundEnvVar = "FOREGROUND" - private static let vsockPort = 1024 - private static let standardErrorLock = NSLock() +struct Application: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "vminitd", + abstract: "VM init process and container agent", + version: "1.0.0", + subcommands: [ + InitCommand.self, + FsNotifyCommand.self, + ], + defaultSubcommand: InitCommand.self + ) - private static func runInForeground(_ log: Logger) throws { + static let foregroundEnvVar = "FOREGROUND" + static let vsockPort = 1024 + static let standardErrorLock = NSLock() + + static func runInForeground(_ log: Logger) throws { log.info("running vminitd under pid1") var command = Command("/sbin/vminitd") @@ -41,13 +55,16 @@ struct Application { command.stdin = .standardInput command.stdout = .standardOutput command.stderr = .standardError - command.environment = ["\(foregroundEnvVar)=1"] + + var env = ProcessInfo.processInfo.environment + env[foregroundEnvVar] = 
"1" + command.environment = env.map { "\($0.key)=\($0.value)" } try command.start() _ = try command.wait() } - private static func adjustLimits() throws { + static func adjustLimits() throws { var limits = rlimit() guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else { throw POSIXError(.init(rawValue: errno)!) @@ -60,64 +77,241 @@ struct Application { } @Sendable - private static func standardError(label: String) -> StreamLogHandler { + static func standardError(label: String) -> StreamLogHandler { standardErrorLock.withLock { StreamLogHandler.standardError(label: label) } } - static func main() async throws { - LoggingSystem.bootstrap(standardError) - var log = Logger(label: "vminitd") + static func exit(_ code: Int32) -> Never { + #if os(Linux) + Musl.exit(code) + #else + Darwin.exit(code) + #endif + } +} + +extension Application { + struct InitCommand: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "init", + abstract: "Run vminitd as init process (default)" + ) + + func run() async throws { + LoggingSystem.bootstrap(Application.standardError) + var log = Logger(label: "vminitd") + + try Application.adjustLimits() + + // when running under debug mode, launch vminitd as a sub process of pid1 + // so that we get a chance to collect better logs and errors before pid1 exists + // and the kernel panics. + #if DEBUG + let environment = ProcessInfo.processInfo.environment + let foreground = environment[Application.foregroundEnvVar] + log.info("checking for shim var \(Application.foregroundEnvVar)=\(String(describing: foreground))") + + if foreground == nil { + try Application.runInForeground(log) + Application.exit(0) + } - try adjustLimits() + // since we are not running as pid1 in this mode we must set ourselves + // as a subpreaper so that all child processes are reaped by us and not + // passed onto our parent. 
+ CZ_set_sub_reaper() + #endif - // when running under debug mode, launch vminitd as a sub process of pid1 - // so that we get a chance to collect better logs and errors before pid1 exists - // and the kernel panics. - #if DEBUG - let environment = ProcessInfo.processInfo.environment - let foreground = environment[Self.foregroundEnvVar] - log.info("checking for shim var \(foregroundEnvVar)=\(String(describing: foreground))") + signal(SIGPIPE, SIG_IGN) - if foreground == nil { - try runInForeground(log) - exit(0) + // Because the sysctl rpc wouldn't make sense if this didn't always exist, we + // ALWAYS mount /proc. + guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else { + log.error("failed to mount /proc") + Application.exit(1) + } + guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else { + log.error("failed to mount /run") + Application.exit(1) + } + try Binfmt.mount() + + log.logLevel = .debug + + log.info("vminitd booting") + let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + let server = Initd(log: log, group: eg) + + do { + log.info("serving vminitd API") + try await server.serve(port: Application.vsockPort) + log.info("vminitd API returned") + } catch { + log.error("vminitd boot error \(error)") + Application.exit(1) + } } + } +} - // since we are not running as pid1 in this mode we must set ourselves - // as a subpreaper so that all child processes are reaped by us and not - // passed onto our parent. 
- CZ_set_sub_reaper() - #endif +extension Application { + struct FsNotifyCommand: ParsableCommand { + static let configuration = CommandConfiguration( + commandName: "fs-notify", + abstract: "Internal command to run filesystem notification worker in container namespace", + shouldDisplay: false + ) + + @Argument(help: "Container PID whose namespace to enter") + var containerPID: Int32 + + private static let handshakeReady: UInt8 = 0xAA + private static let handshakeFailure: UInt8 = 0xFF + + func run() throws { + // FD 3 = socket (extraFiles[0]), FD 4 = error pipe (extraFiles[1]) + let socketFD: Int32 = 3 + let errorPipeFD: Int32 = 4 + + do { + try enterContainerNamespace(containerPID: containerPID) + close(errorPipeFD) + } catch { + let errorMsg = "Failed to enter namespace: \(error)" + _ = errorMsg.utf8CString.withUnsafeBufferPointer { buffer in + // -1 to skip null terminator + write(errorPipeFD, buffer.baseAddress, buffer.count - 1) + } + close(errorPipeFD) - signal(SIGPIPE, SIG_IGN) + var failureHandshake = Self.handshakeFailure + _ = write(socketFD, &failureHandshake, 1) + close(socketFD) + Application.exit(1) + } - // Because the sysctl rpc wouldn't make sense if this didn't always exist, we - // ALWAYS mount /proc. 
- guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else { - log.error("failed to mount /proc") - exit(1) + var readyHandshake = Self.handshakeReady + guard write(socketFD, &readyHandshake, 1) == 1 else { + close(socketFD) + Application.exit(1) + } + + while true { + do { + guard let (eventID, path, eventType) = try readEventFromParent(socket: socketFD) else { + break + } + + var success: UInt8 = 1 + do { + try generateSyntheticInotifyEvent(path: path, eventType: eventType) + } catch { + success = 0 + } + + try sendResponseToParent(socket: socketFD, eventID: eventID, success: success) + } catch { + break + } + } + + close(socketFD) } - guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else { - log.error("failed to mount /run") - exit(1) + + private func enterContainerNamespace(containerPID: Int32) throws { + let nsPath = "/proc/\(containerPID)/ns/mnt" + let vmNsPath = "/proc/self/ns/mnt" + + let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + defer { + containerNsStatPtr.deallocate() + vmNsStatPtr.deallocate() + } + + let containerStatResult = stat(nsPath, containerNsStatPtr) + let vmStatResult = stat(vmNsPath, vmNsStatPtr) + + if containerStatResult == 0 && vmStatResult == 0 { + let containerInode = containerNsStatPtr.pointee.st_ino + let vmInode = vmNsStatPtr.pointee.st_ino + + if containerInode == vmInode { + return + } + } + + let fd = open(nsPath, O_RDONLY) + guard fd >= 0 else { + throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") + } + defer { + _ = close(fd) + } + let _ = unshare(CLONE_FS) + let setnsResult = setns(fd, CLONE_NEWNS) + guard setnsResult == 0 else { + throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") + } } - try Binfmt.mount() - log.logLevel = .debug + private func readEventFromParent(socket: Int32) throws -> (UInt32, String, 
Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? { + var eventTypeValue: UInt32 = 0 + guard read(socket, &eventTypeValue, 4) == 4 else { return nil } + eventTypeValue = UInt32(bigEndian: eventTypeValue) + + var pathLen: UInt32 = 0 + guard read(socket, &pathLen, 4) == 4 else { return nil } + pathLen = UInt32(bigEndian: pathLen) + + let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) + defer { pathData.deallocate() } + guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } + let pathBytes = Data(bytes: pathData, count: Int(pathLen)) + guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } + + var eventID: UInt32 = 0 + guard read(socket, &eventID, 4) == 4 else { return nil } + eventID = UInt32(bigEndian: eventID) + + guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { + return nil + } + + return (eventID, path, eventType) + } + + private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { + var buffer = Data() + buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) + buffer.append(success) + + try buffer.withUnsafeBytes { bytes in + let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) + guard written == buffer.count else { + throw ContainerizationError(.internalError, message: "Failed to write response to parent") + } + } + } - log.info("vminitd booting") - let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - let server = Initd(log: log, group: eg) + private func generateSyntheticInotifyEvent( + path: String, + eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + ) throws { + if eventType == .delete && !FileManager.default.fileExists(atPath: path) { + return + } - do { - log.info("serving vminitd API") - try await server.serve(port: vsockPort) - log.info("vminitd API returned") - } catch { - 
log.error("vminitd boot error \(error)") - exit(1) + let attributes = try FileManager.default.attributesOfItem(atPath: path) + guard let permissions = attributes[.posixPermissions] as? NSNumber else { + throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") + } + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) } } } diff --git a/vminitd/Sources/vminitd/FilesystemEventWorker.swift b/vminitd/Sources/vminitd/FilesystemEventWorker.swift index 2c0ba463..475134f6 100644 --- a/vminitd/Sources/vminitd/FilesystemEventWorker.swift +++ b/vminitd/Sources/vminitd/FilesystemEventWorker.swift @@ -16,6 +16,7 @@ import Containerization import ContainerizationError +import ContainerizationOS import Foundation import NIOCore import NIOPosix @@ -60,67 +61,102 @@ final class FilesystemEventWorker: @unchecked Sendable { let parentSocket = sockets[0] let childSocket = sockets[1] - let pid = fork() - guard pid >= 0 else { + var errorPipe: [Int32] = [0, 0] + guard pipe(&errorPipe) == 0 else { close(parentSocket) close(childSocket) - throw ContainerizationError(.internalError, message: "Failed to fork: errno \(errno)") + throw ContainerizationError(.internalError, message: "Failed to create error pipe: errno \(errno)") } + let errorReadFD = errorPipe[0] + let errorWriteFD = errorPipe[1] + + // Use Command to exec vminitd fs-notify subcommand (fork+execve) + // Socket is FD 3 (extraFiles[0]), error pipe is FD 4 (extraFiles[1]) + var command = Command("/sbin/vminitd", arguments: ["fs-notify", String(containerPID)]) + command.extraFiles = [ + FileHandle(fileDescriptor: childSocket, closeOnDealloc: false), + FileHandle(fileDescriptor: errorWriteFD, closeOnDealloc: false), + ] + command.stdin = .standardInput + command.stdout = .standardOutput + command.stderr = .standardError - if pid == 0 { + do { + try command.start() + } catch { close(parentSocket) - runChildProcess(socket: childSocket) - 
exit(0) - } else { close(childSocket) - self.childPID = pid - self.parentSocket = parentSocket - - var handshake: UInt8 = 0 - let readResult = read(parentSocket, &handshake, 1) - - if readResult != 1 { - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process failed to start") - } + close(errorReadFD) + close(errorWriteFD) + throw ContainerizationError(.internalError, message: "Failed to start fs-notify process: \(error)") + } - if handshake == Self.handshakeFailure { - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process failed to enter container namespace") - } + let pid = command.pid + close(childSocket) + close(errorWriteFD) // Close write end in parent + self.childPID = pid + self.parentSocket = parentSocket - if handshake != Self.handshakeReady { - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Child process sent unexpected handshake: \(handshake)") - } + var handshake: UInt8 = 0 + let readResult = read(parentSocket, &handshake, 1) - do { - let bootstrap = NIOPipeBootstrap(group: eventLoop) - .channelInitializer { channel in - let handler = ResponseHandler(worker: self) - return channel.pipeline.addHandler(handler) - } - self.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() - } catch { - close(parentSocket) - self.parentSocket = nil - var status: Int32 = 0 - waitpid(pid, &status, 0) - self.childPID = nil - throw ContainerizationError(.internalError, message: "Failed to setup NIO channel: \(error)") - } + if readResult != 1 { + close(parentSocket) + self.parentSocket = nil + close(errorReadFD) + var status: Int32 = 0 + waitpid(pid, 
&status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process failed to start") + } + + if handshake == Self.handshakeFailure { + close(parentSocket) + self.parentSocket = nil + + // Read error message from child + var errorBuffer = [UInt8](repeating: 0, count: 1024) + let bytesRead = read(errorReadFD, &errorBuffer, errorBuffer.count) + close(errorReadFD) + + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + + let errorMsg = + bytesRead > 0 + ? (String(bytes: errorBuffer.prefix(bytesRead), encoding: .utf8) ?? "unknown error") + : "no error message" + throw ContainerizationError(.internalError, message: "Child process failed: \(errorMsg)") + } + + if handshake != Self.handshakeReady { + close(parentSocket) + self.parentSocket = nil + close(errorReadFD) + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Child process sent unexpected handshake: \(handshake)") + } + + // Success - close error pipe + close(errorReadFD) + + do { + let bootstrap = NIOPipeBootstrap(group: eventLoop) + .channelInitializer { channel in + let handler = ResponseHandler(worker: self) + return channel.pipeline.addHandler(handler) + } + self.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() + } catch { + close(parentSocket) + self.parentSocket = nil + var status: Int32 = 0 + waitpid(pid, &status, 0) + self.childPID = nil + throw ContainerizationError(.internalError, message: "Failed to setup NIO channel: \(error)") } } @@ -178,44 +214,6 @@ final class FilesystemEventWorker: @unchecked Sendable { } } - private func runChildProcess(socket: Int32) { - do { - try enterContainerNamespace() - } catch { - var failureHandshake = Self.handshakeFailure - _ = write(socket, &failureHandshake, 1) - close(socket) - exit(1) - } - - var readyHandshake = Self.handshakeReady - guard write(socket, &readyHandshake, 1) == 1 else { - 
close(socket) - exit(1) - } - - while true { - do { - guard let (eventID, path, eventType) = try readEventFromParent(socket: socket) else { - break - } - - var success: UInt8 = 1 - do { - try generateSyntheticInotifyEvent(path: path, eventType: eventType) - } catch { - success = 0 - } - - try sendResponseToParent(socket: socket, eventID: eventID, success: success) - } catch { - break - } - } - - close(socket) - } - private func sendEventToChild(socket: Int32, eventID: UInt32, path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) throws { let pathData = path.data(using: .utf8) ?? Data() let pathLen = UInt32(pathData.count) @@ -235,104 +233,6 @@ final class FilesystemEventWorker: @unchecked Sendable { } } - private func readEventFromParent(socket: Int32) throws -> (UInt32, String, Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? { - var eventTypeValue: UInt32 = 0 - guard read(socket, &eventTypeValue, 4) == 4 else { return nil } - eventTypeValue = UInt32(bigEndian: eventTypeValue) - - var pathLen: UInt32 = 0 - guard read(socket, &pathLen, 4) == 4 else { return nil } - pathLen = UInt32(bigEndian: pathLen) - - let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) - defer { pathData.deallocate() } - guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } - let pathBytes = Data(bytes: pathData, count: Int(pathLen)) - guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } - - var eventID: UInt32 = 0 - guard read(socket, &eventID, 4) == 4 else { return nil } - eventID = UInt32(bigEndian: eventID) - - guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { - return nil - } - - return (eventID, path, eventType) - } - - private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { - var buffer = Data() - buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) - 
buffer.append(success) - - try buffer.withUnsafeBytes { bytes in - let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) - guard written == buffer.count else { - throw ContainerizationError(.internalError, message: "Failed to write response to parent") - } - } - } - - private func enterContainerNamespace() throws { - let nsPath = "/proc/\(containerPID)/ns/mnt" - let vmNsPath = "/proc/self/ns/mnt" - - guard FileManager.default.fileExists(atPath: nsPath) else { - throw ContainerizationError(.internalError, message: "Namespace file does not exist: \(nsPath)") - } - - let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - defer { - containerNsStatPtr.deallocate() - vmNsStatPtr.deallocate() - } - - let containerStatResult = stat(nsPath, containerNsStatPtr) - let vmStatResult = stat(vmNsPath, vmNsStatPtr) - - if containerStatResult == 0 && vmStatResult == 0 { - let containerInode = containerNsStatPtr.pointee.st_ino - let vmInode = vmNsStatPtr.pointee.st_ino - - if containerInode == vmInode { - return - } - } - - let fd = open(nsPath, O_RDONLY) - guard fd >= 0 else { - throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") - } - defer { - _ = close(fd) - } - - let setnsResult = setns(fd, CLONE_NEWNS) - guard setnsResult == 0 else { - throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") - } - } - - private func generateSyntheticInotifyEvent( - path: String, - eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType - ) throws { - if eventType == .delete && !FileManager.default.fileExists(atPath: path) { - return - } - - let attributes = try FileManager.default.attributesOfItem(atPath: path) - guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { - throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") - } - try FileManager.default.setAttributes( - [.posixPermissions: permissions], - ofItemAtPath: path - ) - } - private final class ResponseHandler: ChannelInboundHandler, @unchecked Sendable { typealias InboundIn = ByteBuffer diff --git a/vminitd/Sources/vminitd/ManagedContainer.swift b/vminitd/Sources/vminitd/ManagedContainer.swift index ec9f8872..1b54ac97 100644 --- a/vminitd/Sources/vminitd/ManagedContainer.swift +++ b/vminitd/Sources/vminitd/ManagedContainer.swift @@ -90,7 +90,6 @@ actor ManagedContainer { self.bundle = bundle self.log = log self.group = group - self.filesystemEventWorker = nil } catch { try? cgManager.delete() @@ -109,15 +108,7 @@ extension ManagedContainer { } } - private func startFilesystemEventWorker() throws { - let pid = self.initProcess.pid - guard pid > 0 else { - throw ContainerizationError(.invalidState, message: "Container process not started") - } - - let eventLoop = group.next() - let worker = FilesystemEventWorker(containerID: self.id, containerPID: pid, eventLoop: eventLoop) - try worker.start() + private func installWorker(_ worker: FilesystemEventWorker) { self.filesystemEventWorker = worker } @@ -153,12 +144,27 @@ extension ManagedContainer { func start(execID: String) async throws -> Int32 { let proc = try self.getExecOrInit(execID: execID) - let pid = try await ProcessSupervisor.default.start(process: proc) + let onPidReady: (@Sendable (Int32) throws -> Void)? 
if execID == self.id { - try self.startFilesystemEventWorker() + // Capture needed values for callback + let containerID = self.id + let eventLoop = self.group.next() + + onPidReady = { [weak self] pid in + let worker = FilesystemEventWorker(containerID: containerID, containerPID: pid, eventLoop: eventLoop) + try worker.start() + + // Hop back to actor to install worker + Task { [weak self] in + await self?.installWorker(worker) + } + } + } else { + onPidReady = nil } + let pid = try await ProcessSupervisor.default.start(process: proc, onPidReady: onPidReady) return pid } diff --git a/vminitd/Sources/vminitd/ManagedProcess.swift b/vminitd/Sources/vminitd/ManagedProcess.swift index 965d1205..45ea75a2 100644 --- a/vminitd/Sources/vminitd/ManagedProcess.swift +++ b/vminitd/Sources/vminitd/ManagedProcess.swift @@ -148,7 +148,7 @@ final class ManagedProcess: Sendable { } extension ManagedProcess { - func start() throws -> Int32 { + func start(onPidReady: (@Sendable (Int32) throws -> Void)? = nil) throws -> Int32 { try self.state.withLock { log.info( "starting managed process", @@ -197,6 +197,8 @@ extension ManagedProcess { try cgManager.addProcess(pid: pid) } + try onPidReady?(pid) + log.info( "sending pid acknowledgement", metadata: [ diff --git a/vminitd/Sources/vminitd/ProcessSupervisor.swift b/vminitd/Sources/vminitd/ProcessSupervisor.swift index a0a70929..9cfe19b5 100644 --- a/vminitd/Sources/vminitd/ProcessSupervisor.swift +++ b/vminitd/Sources/vminitd/ProcessSupervisor.swift @@ -98,7 +98,7 @@ actor ProcessSupervisor { } } - func start(process: ManagedProcess) throws -> Int32 { + func start(process: ManagedProcess, onPidReady: (@Sendable (Int32) throws -> Void)? 
= nil) throws -> Int32 { self.log?.debug("in supervisor lock to start process") defer { self.log?.debug("out of supervisor lock to start process") @@ -106,7 +106,7 @@ actor ProcessSupervisor { do { self.processes.append(process) - return try process.start() + return try process.start(onPidReady: onPidReady) } catch { self.log?.error("process start failed \(error)", metadata: ["process-id": "\(process.id)"]) throw error From 334246a294c6bf1cbc821ad764f1843059e83988 Mon Sep 17 00:00:00 2001 From: Raj Date: Mon, 13 Oct 2025 14:18:31 -0700 Subject: [PATCH 08/13] make FilesystemEventWorker properly sendable and fix kernel panic when vminitd runs as PID 1 --- vminitd/Sources/vminitd/Application.swift | 21 ++- .../vminitd/FilesystemEventWorker.swift | 143 ++++++++++-------- 2 files changed, 92 insertions(+), 72 deletions(-) diff --git a/vminitd/Sources/vminitd/Application.swift b/vminitd/Sources/vminitd/Application.swift index bd85e617..872770ab 100644 --- a/vminitd/Sources/vminitd/Application.swift +++ b/vminitd/Sources/vminitd/Application.swift @@ -48,7 +48,8 @@ struct Application: AsyncParsableCommand { static let standardErrorLock = NSLock() static func runInForeground(_ log: Logger) throws { - log.info("running vminitd under pid1") + precondition(getpid() != 1, "runInForeground must not be called as PID 1") + log.info("running vminitd under pid1 wrapper") var command = Command("/sbin/vminitd") command.attrs = .init(setsid: true) @@ -111,17 +112,21 @@ extension Application { #if DEBUG let environment = ProcessInfo.processInfo.environment let foreground = environment[Application.foregroundEnvVar] - log.info("checking for shim var \(Application.foregroundEnvVar)=\(String(describing: foreground))") + let isPid1 = (getpid() == 1) + log.info("checking for shim var \(Application.foregroundEnvVar)=\(String(describing: foreground)); pid=\(getpid())") - if foreground == nil { + // only use the FOREGROUND shim when we're not PID 1 + // if we are PID 1 (fresh VM boot), skip the 
shim to avoid exiting init + if foreground == nil && !isPid1 { try Application.runInForeground(log) - Application.exit(0) + Application.exit(0) // parent is not PID 1; safe to exit after child completes } - // since we are not running as pid1 in this mode we must set ourselves - // as a subpreaper so that all child processes are reaped by us and not - // passed onto our parent. - CZ_set_sub_reaper() + // we only need to be a subreaper when we're not PID 1 + // (when PID 1, the kernel already reaps children) + if !isPid1 { + CZ_set_sub_reaper() + } #endif signal(SIGPIPE, SIG_IGN) diff --git a/vminitd/Sources/vminitd/FilesystemEventWorker.swift b/vminitd/Sources/vminitd/FilesystemEventWorker.swift index 475134f6..f433efe1 100644 --- a/vminitd/Sources/vminitd/FilesystemEventWorker.swift +++ b/vminitd/Sources/vminitd/FilesystemEventWorker.swift @@ -28,20 +28,30 @@ import Musl import Glibc #endif -final class FilesystemEventWorker: @unchecked Sendable { +final class FilesystemEventWorker: Sendable { private static let handshakeReady: UInt8 = 0xAA private static let handshakeFailure: UInt8 = 0xFF private let containerID: String private let containerPID: Int32 - private var childPID: Int32? - private var parentSocket: Int32? - private var channel: Channel? private let eventLoop: EventLoop - private var eventIDCounter: UInt32 = 0 - private let pendingEvents: Mutex<[UInt32: CheckedContinuation]> = Mutex([:]) private let shouldStop: Atomic = Atomic(false) + // Cross-thread state (accessed from any thread) + private struct State { + var childPID: Int32? + var parentSocket: Int32? + } + private let state: Mutex = Mutex(State(childPID: nil, parentSocket: nil)) + + // Event-loop confined state (only accessed on channel.eventLoop) + private final class ELState: @unchecked Sendable { + var channel: Channel? 
+ var eventIDCounter: UInt32 = 0 + var pendingEvents: [UInt32: CheckedContinuation] = [:] + } + private let elState = ELState() + init(containerID: String, containerPID: Int32, eventLoop: EventLoop) { self.containerID = containerID self.containerPID = containerPID @@ -49,7 +59,7 @@ final class FilesystemEventWorker: @unchecked Sendable { } func start() throws { - guard childPID == nil else { + guard state.withLock({ $0.childPID }) == nil else { throw ContainerizationError(.invalidState, message: "FilesystemEventWorker already started") } @@ -94,25 +104,27 @@ final class FilesystemEventWorker: @unchecked Sendable { let pid = command.pid close(childSocket) close(errorWriteFD) // Close write end in parent - self.childPID = pid - self.parentSocket = parentSocket + state.withLock { + $0.childPID = pid + $0.parentSocket = parentSocket + } var handshake: UInt8 = 0 let readResult = read(parentSocket, &handshake, 1) if readResult != 1 { close(parentSocket) - self.parentSocket = nil + state.withLock { $0.parentSocket = nil } close(errorReadFD) var status: Int32 = 0 waitpid(pid, &status, 0) - self.childPID = nil + state.withLock { $0.childPID = nil } throw ContainerizationError(.internalError, message: "Child process failed to start") } if handshake == Self.handshakeFailure { close(parentSocket) - self.parentSocket = nil + state.withLock { $0.parentSocket = nil } // Read error message from child var errorBuffer = [UInt8](repeating: 0, count: 1024) @@ -121,7 +133,7 @@ final class FilesystemEventWorker: @unchecked Sendable { var status: Int32 = 0 waitpid(pid, &status, 0) - self.childPID = nil + state.withLock { $0.childPID = nil } let errorMsg = bytesRead > 0 @@ -132,11 +144,11 @@ final class FilesystemEventWorker: @unchecked Sendable { if handshake != Self.handshakeReady { close(parentSocket) - self.parentSocket = nil + state.withLock { $0.parentSocket = nil } close(errorReadFD) var status: Int32 = 0 waitpid(pid, &status, 0) - self.childPID = nil + state.withLock { $0.childPID 
= nil } throw ContainerizationError(.internalError, message: "Child process sent unexpected handshake: \(handshake)") } @@ -149,37 +161,40 @@ final class FilesystemEventWorker: @unchecked Sendable { let handler = ResponseHandler(worker: self) return channel.pipeline.addHandler(handler) } - self.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() + self.elState.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() } catch { close(parentSocket) - self.parentSocket = nil + state.withLock { $0.parentSocket = nil } var status: Int32 = 0 waitpid(pid, &status, 0) - self.childPID = nil + state.withLock { $0.childPID = nil } throw ContainerizationError(.internalError, message: "Failed to setup NIO channel: \(error)") } } func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { - guard let socket = parentSocket, !shouldStop.load(ordering: .relaxed) else { - throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") + let socket = try state.withLock { state throws -> Int32 in + guard let socket = state.parentSocket, !shouldStop.load(ordering: .relaxed) else { + throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") + } + return socket } - let eventID = eventIDCounter - eventIDCounter += 1 - + // Use continuation to bridge between async and event loop try await withCheckedThrowingContinuation { (continuation: CheckedContinuation) in - pendingEvents.withLock { events in - events[eventID] = continuation - } - - do { - try sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) - } catch { - _ = pendingEvents.withLock { events in - events.removeValue(forKey: eventID) + // Hop to event loop to access event ID counter and store continuation + eventLoop.execute { [elState] in + let eventID = elState.eventIDCounter + elState.eventIDCounter += 1 + 
elState.pendingEvents[eventID] = continuation + + do { + try self.sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) + } catch { + // Remove continuation and resume with error on send failure + _ = elState.pendingEvents.removeValue(forKey: eventID) + continuation.resume(throwing: error) } - continuation.resume(throwing: error) } } } @@ -187,30 +202,32 @@ final class FilesystemEventWorker: @unchecked Sendable { func stop() { shouldStop.store(true, ordering: .relaxed) - if let channel = self.channel { - try? channel.close().wait() - self.channel = nil + // Close channel and clean up pending events on event loop + eventLoop.execute { [elState] in + elState.channel?.close(promise: nil) + elState.channel = nil + + for (_, continuation) in elState.pendingEvents { + continuation.resume(throwing: ContainerizationError(.cancelled, message: "FilesystemEventWorker stopped")) + } + elState.pendingEvents.removeAll() } - self.parentSocket = nil + // Kill child process + state.withLock { state in + state.parentSocket = nil - if let pid = childPID { - #if canImport(Musl) - Musl.kill(pid, SIGTERM) - #elseif canImport(Glibc) - Glibc.kill(pid, SIGTERM) - #endif + if let pid = state.childPID { + #if canImport(Musl) + Musl.kill(pid, SIGTERM) + #elseif canImport(Glibc) + Glibc.kill(pid, SIGTERM) + #endif - var status: Int32 = 0 - waitpid(pid, &status, 0) - childPID = nil - } - - pendingEvents.withLock { events in - for (_, continuation) in events { - continuation.resume(throwing: ContainerizationError(.cancelled, message: "FilesystemEventWorker stopped")) + var status: Int32 = 0 + waitpid(pid, &status, 0) + state.childPID = nil } - events.removeAll() } } @@ -254,25 +271,23 @@ final class FilesystemEventWorker: @unchecked Sendable { break } - worker.pendingEvents.withLock { events in - if let continuation = events.removeValue(forKey: eventID) { - if success == 1 { - continuation.resume() - } else { - continuation.resume(throwing: 
ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) - } + // ResponseHandler runs on event loop, so can access elState.pendingEvents directly + if let continuation = worker.elState.pendingEvents.removeValue(forKey: eventID) { + if success == 1 { + continuation.resume() + } else { + continuation.resume(throwing: ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) } } } } func errorCaught(context: ChannelHandlerContext, error: Error) { - worker.pendingEvents.withLock { events in - for (_, continuation) in events { - continuation.resume(throwing: error) - } - events.removeAll() + // ResponseHandler runs on event loop, so can access elState.pendingEvents directly + for (_, continuation) in worker.elState.pendingEvents { + continuation.resume(throwing: error) } + worker.elState.pendingEvents.removeAll() } } } From 2907115e7fce42506187f6eb7ee257736cc72227 Mon Sep 17 00:00:00 2001 From: Raj Date: Mon, 13 Oct 2025 15:43:58 -0700 Subject: [PATCH 09/13] change max concurrency for integration suite to 6 --- Sources/Integration/ContainerTests.swift | 83 ++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index 5cc303eb..4d2eae3a 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -22,6 +22,8 @@ import ContainerizationOS import Crypto import Foundation import Logging +import NIOCore +import NIOPosix extension IntegrationSuite { func testProcessTrue() async throws { @@ -997,6 +999,87 @@ extension IntegrationSuite { return socketPath } + func testFSNotifyEvents() async throws { + let id = "test-fsnotify-events" + + let bs = try await bootstrap(id, reference: "docker.io/library/node:18-alpine") + let directory = try createMountDirectory() + let inotifyBuffer: IntegrationSuite.BufferWriter = BufferWriter() + let container = try 
LinuxContainer(id, rootfs: bs.rootfs, vmm: bs.vmm) { config in + config.process.arguments = [ + "node", + "-e", + "fs=require('fs');fs.watch(process.argv[1],(t,f)=>console.log(t,f))", + "/mnt", + ] + config.process.stdout = inotifyBuffer + config.process.stderr = inotifyBuffer + config.mounts.append(.share(source: directory.path, destination: "/mnt")) + } + + try await container.create() + try await container.start() + + // Get the vminitd agent to send notifications + let connection = try await container.dialVsock(port: 1024) // Default vminitd port + let group = MultiThreadedEventLoopGroup(numberOfThreads: 1) + let agent = Vminitd(connection: connection, group: group) + try await Task.sleep(for: .seconds(1)) + + let createResponse = try await agent.notifyFileSystemEvent( + path: "/mnt/hi.txt", + eventType: .create, + containerID: id + ) + + guard createResponse.success else { + throw IntegrationError.assert(msg: "CREATE event failed: \(createResponse.error)") + } + + let modifyResponse = try await agent.notifyFileSystemEvent( + path: "/mnt/hi.txt", + eventType: .modify, + containerID: id + ) + guard modifyResponse.success else { + throw IntegrationError.assert(msg: "MODIFY event failed: \(modifyResponse.error)") + } + + try await Task.sleep(for: .seconds(1)) + + let deleteResponse = try await agent.notifyFileSystemEvent( + path: "/mnt/nonexistent.txt", + eventType: .delete, + containerID: id + ) + guard deleteResponse.success else { + throw IntegrationError.assert(msg: "DELETE event failed: \(deleteResponse.error)") + } + + try await Task.sleep(for: .seconds(1)) + + let inotifyOutput = String(data: inotifyBuffer.data, encoding: .utf8) ?? 
"" + + let expectedLines = ["change hi.txt", "change hi.txt"] + let actualLines = inotifyOutput.trimmingCharacters(in: .whitespacesAndNewlines).components(separatedBy: .newlines).filter { !$0.isEmpty } + + guard actualLines.count >= expectedLines.count else { + throw IntegrationError.assert(msg: "Expected at least \(expectedLines.count) events, got \(actualLines.count). Output: '\(inotifyOutput)'") + } + + let hasExpectedEvents = expectedLines.allSatisfy { expectedLine in + actualLines.contains(expectedLine) + } + + guard hasExpectedEvents else { + throw IntegrationError.assert(msg: "Expected events not found. Expected: \(expectedLines), Actual: \(actualLines)") + } + + try await agent.close() + try await group.shutdownGracefully() + try await container.stop() + } + private func createMountDirectory() throws -> URL { let dir = FileManager.default.uniqueTemporaryDirectory(create: true) try "hello".write(to: dir.appendingPathComponent("hi.txt"), atomically: true, encoding: .utf8) From 716168edfa854be48aa804ec1ef22d1d6fc7052a Mon Sep 17 00:00:00 2001 From: Raj Date: Fri, 17 Oct 2025 15:03:27 -0700 Subject: [PATCH 10/13] switch event notification to fire and forget, writes to use NIO, and fix other comments --- Sources/Containerization/Vminitd.swift | 3 +- Sources/Integration/ContainerTests.swift | 39 ++--- vminitd/Sources/vminitd/Application.swift | 37 ++-- .../vminitd/FilesystemEventWorker.swift | 164 +++++------------- .../Sources/vminitd/ManagedContainer.swift | 8 +- 5 files changed, 83 insertions(+), 168 deletions(-) diff --git a/Sources/Containerization/Vminitd.swift b/Sources/Containerization/Vminitd.swift index e8b053f3..bb294158 100644 --- a/Sources/Containerization/Vminitd.swift +++ b/Sources/Containerization/Vminitd.swift @@ -335,6 +335,7 @@ extension Vminitd: VirtualMachineAgent { extension Vminitd { public typealias FileSystemEventRequest = Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventRequest public typealias FileSystemEventResponse = 
Com_Apple_Containerization_Sandbox_V3_NotifyFileSystemEventResponse + public typealias FileSystemEventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType /// Sets up an emulator in the guest. public func setupEmulator(binaryPath: String, configuration: Binfmt.Entry) async throws { let request = Com_Apple_Containerization_Sandbox_V3_SetupEmulatorRequest.with { @@ -457,7 +458,7 @@ extension Vminitd { public func notifyFileSystemEvent( path: String, - eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType, + eventType: FileSystemEventType, containerID: String ) async throws -> FileSystemEventResponse { let request = FileSystemEventRequest.with { diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index 4d2eae3a..d1cf34cc 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -1026,16 +1026,26 @@ extension IntegrationSuite { let agent = Vminitd(connection: connection, group: group) try await Task.sleep(for: .seconds(1)) + // Send CREATE event let createResponse = try await agent.notifyFileSystemEvent( path: "/mnt/hi.txt", eventType: .create, containerID: id ) - guard createResponse.success else { throw IntegrationError.assert(msg: "CREATE event failed: \(createResponse.error)") } + try await Task.sleep(for: .seconds(1)) + + let output1 = String(data: inotifyBuffer.data, encoding: .utf8) ?? "" + let lines1 = output1.trimmingCharacters(in: .whitespacesAndNewlines).components(separatedBy: .newlines).filter { !$0.isEmpty } + + guard lines1 == ["change hi.txt"] else { + throw IntegrationError.assert(msg: "CREATE should output 'change hi.txt'. Got: \(lines1)") + } + + // Send MODIFY event let modifyResponse = try await agent.notifyFileSystemEvent( path: "/mnt/hi.txt", eventType: .modify, @@ -1047,6 +1057,14 @@ extension IntegrationSuite { try await Task.sleep(for: .seconds(1)) + let output2 = String(data: inotifyBuffer.data, encoding: .utf8) ?? 
"" + let lines2 = output2.trimmingCharacters(in: .whitespacesAndNewlines).components(separatedBy: .newlines).filter { !$0.isEmpty } + + guard lines2 == ["change hi.txt", "change hi.txt"] else { + throw IntegrationError.assert(msg: "After MODIFY, expected exactly 2 'change hi.txt'. Got: \(lines2)") + } + + // Send DELETE event on non-existent file (should succeed but not crash) let deleteResponse = try await agent.notifyFileSystemEvent( path: "/mnt/nonexistent.txt", eventType: .delete, @@ -1056,25 +1074,6 @@ extension IntegrationSuite { throw IntegrationError.assert(msg: "DELETE event failed: \(deleteResponse.error)") } - try await Task.sleep(for: .seconds(1)) - - let inotifyOutput = String(data: inotifyBuffer.data, encoding: .utf8) ?? "" - - let expectedLines = ["change hi.txt", "change hi.txt"] - let actualLines = inotifyOutput.trimmingCharacters(in: .whitespacesAndNewlines).components(separatedBy: .newlines).filter { !$0.isEmpty } - - guard actualLines.count >= expectedLines.count else { - throw IntegrationError.assert(msg: "Expected at least \(expectedLines.count) events, got \(actualLines.count). Output: '\(inotifyOutput)'") - } - - let hasExpectedEvents = expectedLines.allSatisfy { expectedLine in - actualLines.contains(expectedLine) - } - - guard hasExpectedEvents else { - throw IntegrationError.assert(msg: "Expected events not found. 
Expected: \(expectedLines), Actual: \(actualLines)") - } - try await agent.close() try await group.shutdownGracefully() try await container.stop() diff --git a/vminitd/Sources/vminitd/Application.swift b/vminitd/Sources/vminitd/Application.swift index 872770ab..4bac9952 100644 --- a/vminitd/Sources/vminitd/Application.swift +++ b/vminitd/Sources/vminitd/Application.swift @@ -205,19 +205,23 @@ extension Application { while true { do { - guard let (eventID, path, eventType) = try readEventFromParent(socket: socketFD) else { + guard let (path, eventType) = try readEventFromParent(socket: socketFD) else { break } - var success: UInt8 = 1 do { try generateSyntheticInotifyEvent(path: path, eventType: eventType) } catch { - success = 0 + // Log detailed error to stderr (captured by parent) + let errorMsg = "Failed to generate inotify event: path=\(path), type=\(eventType), error=\(error)" + fputs(errorMsg + "\n", stderr) + fflush(stderr) } - try sendResponseToParent(socket: socketFD, eventID: eventID, success: success) } catch { + // Log and exit + fputs("Protocol error reading from parent: \(error)\n", stderr) + fflush(stderr) break } } @@ -262,7 +266,7 @@ extension Application { } } - private func readEventFromParent(socket: Int32) throws -> (UInt32, String, Com_Apple_Containerization_Sandbox_V3_FileSystemEventType)? { + private func readEventFromParent(socket: Int32) throws -> (String, FileSystemEventType)? 
{ var eventTypeValue: UInt32 = 0 guard read(socket, &eventTypeValue, 4) == 4 else { return nil } eventTypeValue = UInt32(bigEndian: eventTypeValue) @@ -277,33 +281,16 @@ extension Application { let pathBytes = Data(bytes: pathData, count: Int(pathLen)) guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } - var eventID: UInt32 = 0 - guard read(socket, &eventID, 4) == 4 else { return nil } - eventID = UInt32(bigEndian: eventID) - - guard let eventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType(rawValue: Int(eventTypeValue)) else { + guard let eventType = FileSystemEventType(rawValue: Int(eventTypeValue)) else { return nil } - return (eventID, path, eventType) - } - - private func sendResponseToParent(socket: Int32, eventID: UInt32, success: UInt8) throws { - var buffer = Data() - buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) - buffer.append(success) - - try buffer.withUnsafeBytes { bytes in - let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) - guard written == buffer.count else { - throw ContainerizationError(.internalError, message: "Failed to write response to parent") - } - } + return (path, eventType) } private func generateSyntheticInotifyEvent( path: String, - eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + eventType: FileSystemEventType ) throws { if eventType == .delete && !FileManager.default.fileExists(atPath: path) { return diff --git a/vminitd/Sources/vminitd/FilesystemEventWorker.swift b/vminitd/Sources/vminitd/FilesystemEventWorker.swift index f433efe1..32c4b8ff 100644 --- a/vminitd/Sources/vminitd/FilesystemEventWorker.swift +++ b/vminitd/Sources/vminitd/FilesystemEventWorker.swift @@ -18,6 +18,7 @@ import Containerization import ContainerizationError import ContainerizationOS import Foundation +import Logging import NIOCore import NIOPosix import Synchronization @@ -28,6 +29,8 @@ import Musl import Glibc #endif 
+typealias FileSystemEventType = Com_Apple_Containerization_Sandbox_V3_FileSystemEventType + final class FilesystemEventWorker: Sendable { private static let handshakeReady: UInt8 = 0xAA private static let handshakeFailure: UInt8 = 0xFF @@ -35,27 +38,21 @@ final class FilesystemEventWorker: Sendable { private let containerID: String private let containerPID: Int32 private let eventLoop: EventLoop - private let shouldStop: Atomic = Atomic(false) + private let log: Logger - // Cross-thread state (accessed from any thread) + // Cross-thread state (synchronized via Mutex) private struct State { var childPID: Int32? - var parentSocket: Int32? - } - private let state: Mutex = Mutex(State(childPID: nil, parentSocket: nil)) - - // Event-loop confined state (only accessed on channel.eventLoop) - private final class ELState: @unchecked Sendable { + var isStopped: Bool = false var channel: Channel? - var eventIDCounter: UInt32 = 0 - var pendingEvents: [UInt32: CheckedContinuation] = [:] } - private let elState = ELState() + private let state: Mutex = Mutex(State(childPID: nil, isStopped: false)) - init(containerID: String, containerPID: Int32, eventLoop: EventLoop) { + init(containerID: String, containerPID: Int32, eventLoop: EventLoop, log: Logger) { self.containerID = containerID self.containerPID = containerPID self.eventLoop = eventLoop + self.log = log } func start() throws { @@ -106,7 +103,6 @@ final class FilesystemEventWorker: Sendable { close(errorWriteFD) // Close write end in parent state.withLock { $0.childPID = pid - $0.parentSocket = parentSocket } var handshake: UInt8 = 0 @@ -114,7 +110,6 @@ final class FilesystemEventWorker: Sendable { if readResult != 1 { close(parentSocket) - state.withLock { $0.parentSocket = nil } close(errorReadFD) var status: Int32 = 0 waitpid(pid, &status, 0) @@ -124,7 +119,6 @@ final class FilesystemEventWorker: Sendable { if handshake == Self.handshakeFailure { close(parentSocket) - state.withLock { $0.parentSocket = nil } // Read 
error message from child var errorBuffer = [UInt8](repeating: 0, count: 1024) @@ -144,7 +138,6 @@ final class FilesystemEventWorker: Sendable { if handshake != Self.handshakeReady { close(parentSocket) - state.withLock { $0.parentSocket = nil } close(errorReadFD) var status: Int32 = 0 waitpid(pid, &status, 0) @@ -156,15 +149,15 @@ final class FilesystemEventWorker: Sendable { close(errorReadFD) do { - let bootstrap = NIOPipeBootstrap(group: eventLoop) - .channelInitializer { channel in - let handler = ResponseHandler(worker: self) - return channel.pipeline.addHandler(handler) - } - self.elState.channel = try bootstrap.takingOwnershipOfDescriptor(inputOutput: parentSocket).wait() + let eventChannel = try NIOPipeBootstrap(group: eventLoop) + .takingOwnershipOfDescriptor(inputOutput: parentSocket) + .wait() + + state.withLock { state in + state.channel = eventChannel + } } catch { close(parentSocket) - state.withLock { $0.parentSocket = nil } var status: Int32 = 0 waitpid(pid, &status, 0) state.withLock { $0.childPID = nil } @@ -172,51 +165,46 @@ final class FilesystemEventWorker: Sendable { } } - func enqueueEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { - let socket = try state.withLock { state throws -> Int32 in - guard let socket = state.parentSocket, !shouldStop.load(ordering: .relaxed) else { - throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") - } - return socket - } - - // Use continuation to bridge between async and event loop - try await withCheckedThrowingContinuation { (continuation: CheckedContinuation) in - // Hop to event loop to access event ID counter and store continuation - eventLoop.execute { [elState] in - let eventID = elState.eventIDCounter - elState.eventIDCounter += 1 - elState.pendingEvents[eventID] = continuation - - do { - try self.sendEventToChild(socket: socket, eventID: eventID, path: path, eventType: eventType) - } catch { - // Remove 
continuation and resume with error on send failure - _ = elState.pendingEvents.removeValue(forKey: eventID) - continuation.resume(throwing: error) - } + func enqueueEvent(path: String, eventType: FileSystemEventType) throws { + guard !state.withLock({ $0.isStopped }) else { + throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") + } + + eventLoop.execute { + let channel = self.state.withLock { $0.channel } + guard let channel = channel else { return } + + // Build ByteBuffer with binary protocol: + // [event_type:4 bytes][path_len:4 bytes][path:N bytes] + let pathUTF8Count = path.utf8.count + var buffer = channel.allocator.buffer(capacity: 8 + pathUTF8Count) + buffer.writeInteger(UInt32(eventType.rawValue), endianness: .big) + buffer.writeInteger(UInt32(pathUTF8Count), endianness: .big) + buffer.writeString(path) + + channel.writeAndFlush(buffer).whenFailure { error in + self.log.warning( + "Failed to send event to fs-notify child", + metadata: [ + "container": "\(self.containerID)", + "path": "\(path)", + "error": "\(error)", + ]) } } } func stop() { - shouldStop.store(true, ordering: .relaxed) - - // Close channel and clean up pending events on event loop - eventLoop.execute { [elState] in - elState.channel?.close(promise: nil) - elState.channel = nil - - for (_, continuation) in elState.pendingEvents { - continuation.resume(throwing: ContainerizationError(.cancelled, message: "FilesystemEventWorker stopped")) + eventLoop.execute { + self.state.withLock { state in + state.channel?.close(promise: nil) + state.channel = nil + state.isStopped = true } - elState.pendingEvents.removeAll() } // Kill child process state.withLock { state in - state.parentSocket = nil - if let pid = state.childPID { #if canImport(Musl) Musl.kill(pid, SIGTERM) @@ -230,64 +218,4 @@ final class FilesystemEventWorker: Sendable { } } } - - private func sendEventToChild(socket: Int32, eventID: UInt32, path: String, eventType: 
Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) throws { - let pathData = path.data(using: .utf8) ?? Data() - let pathLen = UInt32(pathData.count) - let eventTypeValue = UInt32(eventType.rawValue) - - var buffer = Data() - buffer.append(contentsOf: withUnsafeBytes(of: eventTypeValue.bigEndian) { Data($0) }) - buffer.append(contentsOf: withUnsafeBytes(of: pathLen.bigEndian) { Data($0) }) - buffer.append(pathData) - buffer.append(contentsOf: withUnsafeBytes(of: eventID.bigEndian) { Data($0) }) - - try buffer.withUnsafeBytes { bytes in - let written = write(socket, bytes.bindMemory(to: UInt8.self).baseAddress, buffer.count) - guard written == buffer.count else { - throw ContainerizationError(.internalError, message: "Failed to write event to child: written \(written), expected \(buffer.count)") - } - } - } - - private final class ResponseHandler: ChannelInboundHandler, @unchecked Sendable { - typealias InboundIn = ByteBuffer - - private var buffer = ByteBuffer() - private unowned let worker: FilesystemEventWorker - - init(worker: FilesystemEventWorker) { - self.worker = worker - } - - func channelRead(context: ChannelHandlerContext, data: NIOAny) { - var inBuffer = unwrapInboundIn(data) - buffer.writeBuffer(&inBuffer) - - while buffer.readableBytes >= 5 { - guard let eventID = buffer.readInteger(endianness: .big, as: UInt32.self), - let success = buffer.readInteger(as: UInt8.self) - else { - break - } - - // ResponseHandler runs on event loop, so can access elState.pendingEvents directly - if let continuation = worker.elState.pendingEvents.removeValue(forKey: eventID) { - if success == 1 { - continuation.resume() - } else { - continuation.resume(throwing: ContainerizationError(.internalError, message: "Child process failed to process filesystem event")) - } - } - } - } - - func errorCaught(context: ChannelHandlerContext, error: Error) { - // ResponseHandler runs on event loop, so can access elState.pendingEvents directly - for (_, continuation) in 
worker.elState.pendingEvents { - continuation.resume(throwing: error) - } - worker.elState.pendingEvents.removeAll() - } - } } diff --git a/vminitd/Sources/vminitd/ManagedContainer.swift b/vminitd/Sources/vminitd/ManagedContainer.swift index 1b54ac97..7c98d042 100644 --- a/vminitd/Sources/vminitd/ManagedContainer.swift +++ b/vminitd/Sources/vminitd/ManagedContainer.swift @@ -22,7 +22,6 @@ import ContainerizationOS import Foundation import Logging import NIOCore -import Synchronization #if canImport(Musl) import Musl @@ -112,11 +111,11 @@ extension ManagedContainer { self.filesystemEventWorker = worker } - func executeFileSystemEvent(path: String, eventType: Com_Apple_Containerization_Sandbox_V3_FileSystemEventType) async throws { + func executeFileSystemEvent(path: String, eventType: FileSystemEventType) throws { guard let worker = self.filesystemEventWorker else { throw ContainerizationError(.invalidState, message: "Filesystem event worker not started for container \(self.id)") } - try await worker.enqueueEvent(path: path, eventType: eventType) + try worker.enqueueEvent(path: path, eventType: eventType) } func createExec( @@ -150,9 +149,10 @@ extension ManagedContainer { // Capture needed values for callback let containerID = self.id let eventLoop = self.group.next() + let log = self.log onPidReady = { [weak self] pid in - let worker = FilesystemEventWorker(containerID: containerID, containerPID: pid, eventLoop: eventLoop) + let worker = FilesystemEventWorker(containerID: containerID, containerPID: pid, eventLoop: eventLoop, log: log) try worker.start() // Hop back to actor to install worker From d84f2f9766735242099a096906309d1f12d8f0f9 Mon Sep 17 00:00:00 2001 From: Raj Date: Thu, 23 Oct 2025 15:01:52 -0700 Subject: [PATCH 11/13] make fmt --- Sources/Integration/Suite.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index 530b5756..ab0300eb 100644 --- 
a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -162,7 +162,11 @@ struct IntegrationSuite: AsyncParsableCommand { static let eventLoop = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - func bootstrap(_ testID: String, reference: String = "ghcr.io/linuxcontainers/alpine:3.20") async throws -> (rootfs: Containerization.Mount, vmm: VirtualMachineManager, image: Containerization.Image, bootlog: URL) { + static let eventLoop = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + + func bootstrap(_ testID: String, reference: String = "ghcr.io/linuxcontainers/alpine:3.20") async throws -> ( + rootfs: Containerization.Mount, vmm: VirtualMachineManager, image: Containerization.Image, bootlog: URL + ) { let store = Self.imageStore let initImage = try await store.getInitImage(reference: Self.initImage) From e368d60ef52d412c5a52cfd2c101115a94041b1a Mon Sep 17 00:00:00 2001 From: Raj Date: Tue, 28 Oct 2025 13:33:38 -0700 Subject: [PATCH 12/13] add bootlog for fsnotify test --- Sources/Integration/ContainerTests.swift | 1 + Sources/Integration/Suite.swift | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Sources/Integration/ContainerTests.swift b/Sources/Integration/ContainerTests.swift index d1cf34cc..525d5eed 100644 --- a/Sources/Integration/ContainerTests.swift +++ b/Sources/Integration/ContainerTests.swift @@ -1015,6 +1015,7 @@ extension IntegrationSuite { config.process.stdout = inotifyBuffer config.process.stderr = inotifyBuffer config.mounts.append(.share(source: directory.path, destination: "/mnt")) + config.bootlog = bs.bootlog } try await container.create() diff --git a/Sources/Integration/Suite.swift b/Sources/Integration/Suite.swift index ab0300eb..319f80f8 100644 --- a/Sources/Integration/Suite.swift +++ b/Sources/Integration/Suite.swift @@ -162,8 +162,6 @@ struct IntegrationSuite: AsyncParsableCommand { static let eventLoop = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - 
static let eventLoop = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - func bootstrap(_ testID: String, reference: String = "ghcr.io/linuxcontainers/alpine:3.20") async throws -> ( rootfs: Containerization.Mount, vmm: VirtualMachineManager, image: Containerization.Image, bootlog: URL ) { From 4558603453bbccdb82780c9ea3a3df7dd64ec0ea Mon Sep 17 00:00:00 2001 From: Raj Date: Thu, 30 Oct 2025 21:45:46 -0700 Subject: [PATCH 13/13] replace self-exec with thread-based FSNotify worker --- vminitd/Package.swift | 1 - vminitd/Sources/vminitd/Application.swift | 284 +++--------------- .../vminitd/FilesystemEventWorker.swift | 245 +++++++++++---- 3 files changed, 234 insertions(+), 296 deletions(-) diff --git a/vminitd/Package.swift b/vminitd/Package.swift index 9070ae75..7c3b81c2 100644 --- a/vminitd/Package.swift +++ b/vminitd/Package.swift @@ -47,7 +47,6 @@ let package = Package( .executableTarget( name: "vminitd", dependencies: [ - .product(name: "ArgumentParser", package: "swift-argument-parser"), .product(name: "Logging", package: "swift-log"), .product(name: "_NIOFileSystem", package: "swift-nio"), .product(name: "Containerization", package: "containerization"), diff --git a/vminitd/Sources/vminitd/Application.swift b/vminitd/Sources/vminitd/Application.swift index 4bac9952..af04ef2c 100644 --- a/vminitd/Sources/vminitd/Application.swift +++ b/vminitd/Sources/vminitd/Application.swift @@ -14,7 +14,6 @@ // limitations under the License. 
//===----------------------------------------------------------------------===// -import ArgumentParser import Containerization import ContainerizationError import ContainerizationOS @@ -26,46 +25,29 @@ import NIOPosix #if os(Linux) import Musl import LCShim -#else -import Darwin #endif @main -struct Application: AsyncParsableCommand { - static let configuration = CommandConfiguration( - commandName: "vminitd", - abstract: "VM init process and container agent", - version: "1.0.0", - subcommands: [ - InitCommand.self, - FsNotifyCommand.self, - ], - defaultSubcommand: InitCommand.self - ) +struct Application { + private static let foregroundEnvVar = "FOREGROUND" + private static let vsockPort = 1024 + private static let standardErrorLock = NSLock() - static let foregroundEnvVar = "FOREGROUND" - static let vsockPort = 1024 - static let standardErrorLock = NSLock() - - static func runInForeground(_ log: Logger) throws { - precondition(getpid() != 1, "runInForeground must not be called as PID 1") - log.info("running vminitd under pid1 wrapper") + private static func runInForeground(_ log: Logger) throws { + log.info("running vminitd under pid1") var command = Command("/sbin/vminitd") command.attrs = .init(setsid: true) command.stdin = .standardInput command.stdout = .standardOutput command.stderr = .standardError - - var env = ProcessInfo.processInfo.environment - env[foregroundEnvVar] = "1" - command.environment = env.map { "\($0.key)=\($0.value)" } + command.environment = ["\(foregroundEnvVar)=1"] try command.start() _ = try command.wait() } - static func adjustLimits() throws { + private static func adjustLimits() throws { var limits = rlimit() guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else { throw POSIXError(.init(rawValue: errno)!) 
@@ -78,232 +60,64 @@ struct Application: AsyncParsableCommand { } @Sendable - static func standardError(label: String) -> StreamLogHandler { + private static func standardError(label: String) -> StreamLogHandler { standardErrorLock.withLock { StreamLogHandler.standardError(label: label) } } - static func exit(_ code: Int32) -> Never { - #if os(Linux) - Musl.exit(code) - #else - Darwin.exit(code) - #endif - } -} - -extension Application { - struct InitCommand: AsyncParsableCommand { - static let configuration = CommandConfiguration( - commandName: "init", - abstract: "Run vminitd as init process (default)" - ) - - func run() async throws { - LoggingSystem.bootstrap(Application.standardError) - var log = Logger(label: "vminitd") - - try Application.adjustLimits() - - // when running under debug mode, launch vminitd as a sub process of pid1 - // so that we get a chance to collect better logs and errors before pid1 exists - // and the kernel panics. - #if DEBUG - let environment = ProcessInfo.processInfo.environment - let foreground = environment[Application.foregroundEnvVar] - let isPid1 = (getpid() == 1) - log.info("checking for shim var \(Application.foregroundEnvVar)=\(String(describing: foreground)); pid=\(getpid())") + static func main() async throws { + LoggingSystem.bootstrap(standardError) + var log = Logger(label: "vminitd") - // only use the FOREGROUND shim when we're not PID 1 - // if we are PID 1 (fresh VM boot), skip the shim to avoid exiting init - if foreground == nil && !isPid1 { - try Application.runInForeground(log) - Application.exit(0) // parent is not PID 1; safe to exit after child completes - } + try adjustLimits() - // we only need to be a subreaper when we're not PID 1 - // (when PID 1, the kernel already reaps children) - if !isPid1 { - CZ_set_sub_reaper() - } - #endif + // when running under debug mode, launch vminitd as a sub process of pid1 + // so that we get a chance to collect better logs and errors before pid1 exits + // and the 
kernel panics. + #if DEBUG + let environment = ProcessInfo.processInfo.environment + let foreground = environment[Self.foregroundEnvVar] + log.info("checking for shim var \(foregroundEnvVar)=\(String(describing: foreground))") - signal(SIGPIPE, SIG_IGN) - - // Because the sysctl rpc wouldn't make sense if this didn't always exist, we - // ALWAYS mount /proc. - guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else { - log.error("failed to mount /proc") - Application.exit(1) - } - guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else { - log.error("failed to mount /run") - Application.exit(1) - } - try Binfmt.mount() - - log.logLevel = .debug - - log.info("vminitd booting") - let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) - let server = Initd(log: log, group: eg) - - do { - log.info("serving vminitd API") - try await server.serve(port: Application.vsockPort) - log.info("vminitd API returned") - } catch { - log.error("vminitd boot error \(error)") - Application.exit(1) - } + if foreground == nil { + try runInForeground(log) + exit(0) } - } -} - -extension Application { - struct FsNotifyCommand: ParsableCommand { - static let configuration = CommandConfiguration( - commandName: "fs-notify", - abstract: "Internal command to run filesystem notification worker in container namespace", - shouldDisplay: false - ) - - @Argument(help: "Container PID whose namespace to enter") - var containerPID: Int32 - - private static let handshakeReady: UInt8 = 0xAA - private static let handshakeFailure: UInt8 = 0xFF - func run() throws { - // FD 3 = socket (extraFiles[0]), FD 4 = error pipe (extraFiles[1]) - let socketFD: Int32 = 3 - let errorPipeFD: Int32 = 4 - - do { - try enterContainerNamespace(containerPID: containerPID) - close(errorPipeFD) - } catch { - let errorMsg = "Failed to enter namespace: \(error)" - _ = errorMsg.utf8CString.withUnsafeBufferPointer { buffer in - // -1 to skip null terminator - write(errorPipeFD, buffer.baseAddress, 
buffer.count - 1) - } - close(errorPipeFD) - - var failureHandshake = Self.handshakeFailure - _ = write(socketFD, &failureHandshake, 1) - close(socketFD) - Application.exit(1) - } - - var readyHandshake = Self.handshakeReady - guard write(socketFD, &readyHandshake, 1) == 1 else { - close(socketFD) - Application.exit(1) - } - - while true { - do { - guard let (path, eventType) = try readEventFromParent(socket: socketFD) else { - break - } - - do { - try generateSyntheticInotifyEvent(path: path, eventType: eventType) - } catch { - // Log detailed error to stderr (captured by parent) - let errorMsg = "Failed to generate inotify event: path=\(path), type=\(eventType), error=\(error)" - fputs(errorMsg + "\n", stderr) - fflush(stderr) - } + // since we are not running as pid1 in this mode we must set ourselves + // as a subreaper so that all child processes are reaped by us and not + // passed onto our parent. + CZ_set_sub_reaper() + #endif - } catch { - // Log and exit - fputs("Protocol error reading from parent: \(error)\n", stderr) - fflush(stderr) - break - } - } + signal(SIGPIPE, SIG_IGN) - close(socketFD) + // Because the sysctl rpc wouldn't make sense if this didn't always exist, we + // ALWAYS mount /proc. 
+ guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else { + log.error("failed to mount /proc") + exit(1) } - - private func enterContainerNamespace(containerPID: Int32) throws { - let nsPath = "/proc/\(containerPID)/ns/mnt" - let vmNsPath = "/proc/self/ns/mnt" - - let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) - defer { - containerNsStatPtr.deallocate() - vmNsStatPtr.deallocate() - } - - let containerStatResult = stat(nsPath, containerNsStatPtr) - let vmStatResult = stat(vmNsPath, vmNsStatPtr) - - if containerStatResult == 0 && vmStatResult == 0 { - let containerInode = containerNsStatPtr.pointee.st_ino - let vmInode = vmNsStatPtr.pointee.st_ino - - if containerInode == vmInode { - return - } - } - - let fd = open(nsPath, O_RDONLY) - guard fd >= 0 else { - throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") - } - defer { - _ = close(fd) - } - let _ = unshare(CLONE_FS) - let setnsResult = setns(fd, CLONE_NEWNS) - guard setnsResult == 0 else { - throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") - } + guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else { + log.error("failed to mount /run") + exit(1) } + try Binfmt.mount() - private func readEventFromParent(socket: Int32) throws -> (String, FileSystemEventType)? 
{ - var eventTypeValue: UInt32 = 0 - guard read(socket, &eventTypeValue, 4) == 4 else { return nil } - eventTypeValue = UInt32(bigEndian: eventTypeValue) - - var pathLen: UInt32 = 0 - guard read(socket, &pathLen, 4) == 4 else { return nil } - pathLen = UInt32(bigEndian: pathLen) - - let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) - defer { pathData.deallocate() } - guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil } - let pathBytes = Data(bytes: pathData, count: Int(pathLen)) - guard let path = String(data: pathBytes, encoding: .utf8) else { return nil } - - guard let eventType = FileSystemEventType(rawValue: Int(eventTypeValue)) else { - return nil - } - - return (path, eventType) - } + log.logLevel = .debug - private func generateSyntheticInotifyEvent( - path: String, - eventType: FileSystemEventType - ) throws { - if eventType == .delete && !FileManager.default.fileExists(atPath: path) { - return - } + log.info("vminitd booting") + let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount) + let server = Initd(log: log, group: eg) - let attributes = try FileManager.default.attributesOfItem(atPath: path) - guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { - throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") - } - try FileManager.default.setAttributes( - [.posixPermissions: permissions], - ofItemAtPath: path - ) + do { + log.info("serving vminitd API") + try await server.serve(port: vsockPort) + log.info("vminitd API returned") + } catch { + log.error("vminitd boot error \(error)") + exit(1) } } } diff --git a/vminitd/Sources/vminitd/FilesystemEventWorker.swift b/vminitd/Sources/vminitd/FilesystemEventWorker.swift index 32c4b8ff..cf73d0e9 100644 --- a/vminitd/Sources/vminitd/FilesystemEventWorker.swift +++ b/vminitd/Sources/vminitd/FilesystemEventWorker.swift @@ -42,11 +42,11 @@ final class FilesystemEventWorker: Sendable { // Cross-thread state (synchronized via Mutex) private struct State { - var childPID: Int32? + var isStarted: Bool = false var isStopped: Bool = false var channel: Channel? } - private let state: Mutex = Mutex(State(childPID: nil, isStopped: false)) + private let state: Mutex = Mutex(State(isStarted: false, isStopped: false)) init(containerID: String, containerPID: Int32, eventLoop: EventLoop, log: Logger) { self.containerID = containerID @@ -56,7 +56,7 @@ final class FilesystemEventWorker: Sendable { } func start() throws { - guard state.withLock({ $0.childPID }) == nil else { + guard !state.withLock({ $0.isStarted }) else { throw ContainerizationError(.invalidState, message: "FilesystemEventWorker already started") } @@ -77,33 +77,27 @@ final class FilesystemEventWorker: Sendable { let errorReadFD = errorPipe[0] let errorWriteFD = errorPipe[1] - // Use Command to exec vminitd fs-notify subcommand (fork+execve) - // Socket is FD 3 (extraFiles[0]), error pipe is FD 4 (extraFiles[1]) - var command = Command("/sbin/vminitd", arguments: ["fs-notify", String(containerPID)]) - command.extraFiles = [ - FileHandle(fileDescriptor: childSocket, closeOnDealloc: false), - FileHandle(fileDescriptor: errorWriteFD, closeOnDealloc: false), - ] 
- command.stdin = .standardInput - command.stdout = .standardOutput - command.stderr = .standardError + let containerID = self.containerID + let containerPID = self.containerPID + let log = self.log - do { - try command.start() - } catch { - close(parentSocket) - close(childSocket) - close(errorReadFD) - close(errorWriteFD) - throw ContainerizationError(.internalError, message: "Failed to start fs-notify process: \(error)") - } + let thread = Thread { [weak self] in + defer { + close(childSocket) + } - let pid = command.pid - close(childSocket) - close(errorWriteFD) // Close write end in parent - state.withLock { - $0.childPID = pid + self?.runWorkerThread( + socket: childSocket, + errorPipe: errorWriteFD, + containerID: containerID, + containerPID: containerPID, + log: log + ) } + thread.name = "fsnotify-\(containerID)" + thread.start() + + state.withLock { $0.isStarted = true } var handshake: UInt8 = 0 let readResult = read(parentSocket, &handshake, 1) @@ -111,41 +105,35 @@ final class FilesystemEventWorker: Sendable { if readResult != 1 { close(parentSocket) close(errorReadFD) - var status: Int32 = 0 - waitpid(pid, &status, 0) - state.withLock { $0.childPID = nil } - throw ContainerizationError(.internalError, message: "Child process failed to start") + state.withLock { $0.isStarted = false } + throw ContainerizationError(.internalError, message: "Worker thread failed to send handshake") } if handshake == Self.handshakeFailure { close(parentSocket) - // Read error message from child + // Read error message from thread var errorBuffer = [UInt8](repeating: 0, count: 1024) let bytesRead = read(errorReadFD, &errorBuffer, errorBuffer.count) close(errorReadFD) - var status: Int32 = 0 - waitpid(pid, &status, 0) - state.withLock { $0.childPID = nil } + state.withLock { $0.isStarted = false } let errorMsg = bytesRead > 0 ? (String(bytes: errorBuffer.prefix(bytesRead), encoding: .utf8) ?? 
"unknown error") : "no error message" - throw ContainerizationError(.internalError, message: "Child process failed: \(errorMsg)") + throw ContainerizationError(.internalError, message: "Worker thread failed: \(errorMsg)") } if handshake != Self.handshakeReady { close(parentSocket) close(errorReadFD) - var status: Int32 = 0 - waitpid(pid, &status, 0) - state.withLock { $0.childPID = nil } - throw ContainerizationError(.internalError, message: "Child process sent unexpected handshake: \(handshake)") + state.withLock { $0.isStarted = false } + throw ContainerizationError(.internalError, message: "Worker thread sent unexpected handshake: \(handshake)") } - // Success - close error pipe + // Success - close error pipe read end close(errorReadFD) do { @@ -158,13 +146,161 @@ final class FilesystemEventWorker: Sendable { } } catch { close(parentSocket) - var status: Int32 = 0 - waitpid(pid, &status, 0) - state.withLock { $0.childPID = nil } + state.withLock { $0.isStarted = false } throw ContainerizationError(.internalError, message: "Failed to setup NIO channel: \(error)") } } + private func runWorkerThread( + socket: Int32, + errorPipe: Int32, + containerID: String, + containerPID: Int32, + log: Logger + ) { + // Helper to send error and handshake failure + func sendError(_ message: String) { + _ = message.utf8CString.withUnsafeBufferPointer { buffer in + write(errorPipe, buffer.baseAddress, buffer.count - 1) + } + close(errorPipe) + var failureHandshake = Self.handshakeFailure + _ = write(socket, &failureHandshake, 1) + } + + do { + try enterContainerNamespace(containerPID: containerPID, log: log) + } catch { + sendError("Failed to enter namespace: \(error)") + return + } + + close(errorPipe) + var readyHandshake = Self.handshakeReady + guard write(socket, &readyHandshake, 1) == 1 else { + return + } + + while true { + do { + guard let (path, eventType) = try readEventFromParent(socket: socket) else { + break + } + + do { + try generateSyntheticInotifyEvent(path: path, 
eventType: eventType) + } catch { + let errorMsg = "Failed to generate inotify event: path=\(path), type=\(eventType), error=\(error)" + fputs(errorMsg + "\n", stderr) + fflush(stderr) + } + } catch { + fputs("Protocol error reading from parent: \(error)\n", stderr) + fflush(stderr) + break + } + } + } + + private func enterContainerNamespace(containerPID: Int32, log: Logger) throws { + let nsPath = "/proc/\(containerPID)/ns/mnt" + let vmNsPath = "/proc/self/ns/mnt" + + let containerNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + let vmNsStatPtr = UnsafeMutablePointer.allocate(capacity: 1) + defer { + containerNsStatPtr.deallocate() + vmNsStatPtr.deallocate() + } + + let containerStatResult = stat(nsPath, containerNsStatPtr) + let vmStatResult = stat(vmNsPath, vmNsStatPtr) + + if containerStatResult == 0 && vmStatResult == 0 { + let containerInode = containerNsStatPtr.pointee.st_ino + let vmInode = vmNsStatPtr.pointee.st_ino + + if containerInode == vmInode { + return + } + } + + let fd = open(nsPath, O_RDONLY) + guard fd >= 0 else { + throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)") + } + defer { + close(fd) + } + + #if canImport(Musl) + let unshareResult = Musl.unshare(CLONE_FS) + #elseif canImport(Glibc) + let unshareResult = Glibc.unshare(CLONE_FS) + #endif + guard unshareResult == 0 else { + throw ContainerizationError(.internalError, message: "Failed to unshare filesystem structure: errno \(errno)") + } + + #if canImport(Musl) + let setnsResult = Musl.setns(fd, CLONE_NEWNS) + #elseif canImport(Glibc) + let setnsResult = Glibc.setns(fd, CLONE_NEWNS) + #endif + guard setnsResult == 0 else { + throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)") + } + } + + private func readEventFromParent(socket: Int32) throws -> (String, FileSystemEventType)? 
{ + var eventTypeValue: UInt32 = 0 + guard read(socket, &eventTypeValue, 4) == 4 else { + return nil + } + eventTypeValue = UInt32(bigEndian: eventTypeValue) + + var pathLen: UInt32 = 0 + guard read(socket, &pathLen, 4) == 4 else { + throw ContainerizationError(.internalError, message: "Failed to read path length from parent") + } + pathLen = UInt32(bigEndian: pathLen) + + let pathData = UnsafeMutablePointer.allocate(capacity: Int(pathLen)) + defer { pathData.deallocate() } + guard read(socket, pathData, Int(pathLen)) == pathLen else { + throw ContainerizationError(.internalError, message: "Failed to read path from parent") + } + let pathBytes = Data(bytes: pathData, count: Int(pathLen)) + guard let path = String(data: pathBytes, encoding: .utf8) else { + throw ContainerizationError(.internalError, message: "Failed to decode path as UTF-8") + } + + guard let eventType = FileSystemEventType(rawValue: Int(eventTypeValue)) else { + throw ContainerizationError(.internalError, message: "Invalid event type: \(eventTypeValue)") + } + + return (path, eventType) + } + + private func generateSyntheticInotifyEvent( + path: String, + eventType: FileSystemEventType + ) throws { + if eventType == .delete && !FileManager.default.fileExists(atPath: path) { + return + } + + let attributes = try FileManager.default.attributesOfItem(atPath: path) + guard let permissions = attributes[.posixPermissions] as? 
NSNumber else { + throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)") + } + + try FileManager.default.setAttributes( + [.posixPermissions: permissions], + ofItemAtPath: path + ) + } + func enqueueEvent(path: String, eventType: FileSystemEventType) throws { guard !state.withLock({ $0.isStopped }) else { throw ContainerizationError(.invalidState, message: "FilesystemEventWorker not running") @@ -195,26 +331,15 @@ final class FilesystemEventWorker: Sendable { } func stop() { + state.withLock { state in + state.isStopped = true + state.isStarted = false + } + eventLoop.execute { self.state.withLock { state in state.channel?.close(promise: nil) state.channel = nil - state.isStopped = true - } - } - - // Kill child process - state.withLock { state in - if let pid = state.childPID { - #if canImport(Musl) - Musl.kill(pid, SIGTERM) - #elseif canImport(Glibc) - Glibc.kill(pid, SIGTERM) - #endif - - var status: Int32 = 0 - waitpid(pid, &status, 0) - state.childPID = nil } } }