From 89b1c12d5859d7d64196a39b513d42c3f6d0a970 Mon Sep 17 00:00:00 2001 From: Carsten Abele Date: Thu, 9 Apr 2026 13:02:21 +0200 Subject: [PATCH] feat: auto-stop on silence, fix transcription hangs - Auto-stop recording after 20 seconds of continuous silence (RMS-based silence detection in AudioCapture) - Add WebSocket ping every 10s to keep connection alive - Auto-reconnect if no events received for 30s (stale connection) - Add diagnostic logging for all event types to help debug issues Co-Authored-By: Claude Opus 4.6 (1M context) --- MyVoxtral/MyVoxtral/Audio/AudioCapture.swift | 39 +++++++++++++- .../Models/TranscriptionManager.swift | 52 ++++++++++++++++--- .../Network/VoxtralWebSocketClient.swift | 8 +++ 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/MyVoxtral/MyVoxtral/Audio/AudioCapture.swift b/MyVoxtral/MyVoxtral/Audio/AudioCapture.swift index f72b3f4..23b949c 100644 --- a/MyVoxtral/MyVoxtral/Audio/AudioCapture.swift +++ b/MyVoxtral/MyVoxtral/Audio/AudioCapture.swift @@ -6,9 +6,17 @@ final class AudioCapture { private let chunkDurationMs: Double = 480 var onChunk: ((Data) -> Void)? + var onSilenceTimeout: (() -> Void)? + + /// RMS threshold below which audio is considered silence (16-bit PCM range) + var silenceThreshold: Int16 = 200 + /// Seconds of continuous silence before triggering timeout + var silenceTimeoutSeconds: Double = 20 private var buffer = Data() private let bytesPerChunk: Int + private var consecutiveSilentChunks: Int = 0 + private var chunksPerSecond: Double { 1000.0 / chunkDurationMs } init() { // 16kHz * 2 bytes (16-bit) * 1 channel * 0.48s = 15360 bytes @@ -58,12 +66,25 @@ final class AudioCapture { self.buffer.append(data) while self.buffer.count >= self.bytesPerChunk { - let chunk = self.buffer.prefix(self.bytesPerChunk) + let chunk = Data(self.buffer.prefix(self.bytesPerChunk)) self.buffer = Data(self.buffer.dropFirst(self.bytesPerChunk)) - self.onChunk?(Data(chunk)) + + if self.isSilent(chunk) { + self.consecutiveSilentChunks += 1 + let silentSeconds = Double(self.consecutiveSilentChunks) / self.chunksPerSecond + if silentSeconds >= self.silenceTimeoutSeconds { + self.onSilenceTimeout?() + return + } + } else { + self.consecutiveSilentChunks = 0 + } + + self.onChunk?(chunk) } } + consecutiveSilentChunks = 0 engine.prepare() try engine.start() } @@ -78,6 +99,20 @@ final class AudioCapture { buffer = Data() } } + + private func isSilent(_ chunk: Data) -> Bool { + chunk.withUnsafeBytes { rawBuffer in + guard let samples = rawBuffer.baseAddress?.assumingMemoryBound(to: Int16.self) else { return true } + let count = chunk.count / 2 + var sumSquares: Int64 = 0 + for i in 0.. 30 { + print("[MyVoxtral] Connection stale (\(Int(elapsed))s since last event) — reconnecting") + self.wsClient.disconnect() + self.wsClient.connect(apiKey: self.settings.apiKey, delayMs: self.settings.streamingDelayMs) + self.lastEventTime = Date() + } else { + self.wsClient.ping() + } + } + } + } + private func handleEvent(_ event: VoxtralEvent) { + lastEventTime = Date() + switch event { case .sessionCreated: - break + print("[MyVoxtral] Session created") case .textDelta(let text): currentText += text if settings.outputMode == .cursorInjection { CursorInjector.typeText(text) } - case .segment: - break - case .language: - break + case .segment(let text, let start, let end): + print("[MyVoxtral] Segment [\(String(format: "%.1f", start))-\(String(format: "%.1f", end))]: \(text.prefix(60))") + case .language(let lang): + print("[MyVoxtral] Language detected: \(lang)") case .done(let text): + print("[MyVoxtral] Done, final text length: \(text.count)") if currentText.isEmpty { currentText = text } case .error(let message): + print("[MyVoxtral] Error: \(message)") if !hasRetried && state == .recording { hasRetried = true wsClient.disconnect() wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs) + lastEventTime = Date() } else { state = .error(message) + staleCheckTimer?.invalidate() + staleCheckTimer = nil audioCapture.stop() } - case .unknown: - break + case .unknown(let type): + print("[MyVoxtral] Unknown event: \(type)") } } } diff --git a/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift b/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift index ec4d1b8..9548f4c 100644 --- a/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift +++ b/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift @@ -47,6 +47,14 @@ final class VoxtralWebSocketClient { sendJSON(AudioFlushMessage()) } + func ping() { + webSocketTask?.sendPing { error in + if let error { + print("[MyVoxtral] Ping failed: \(error.localizedDescription)") + } + } + } + func disconnect() { intentionalDisconnect = true sendJSON(AudioEndMessage())