feat: auto-stop on silence, fix transcription hangs

- Auto-stop recording after 20 seconds of continuous silence
  (RMS-based silence detection in AudioCapture)
- Add WebSocket ping every 10s to keep connection alive
- Auto-reconnect if no events received for 30s (stale connection)
- Add diagnostic logging for all event types to help debug issues

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Carsten Abele 2026-04-09 13:02:21 +02:00
parent 8b56202e90
commit 89b1c12d58
3 changed files with 90 additions and 9 deletions

View file

@@ -6,9 +6,17 @@ final class AudioCapture {
private let chunkDurationMs: Double = 480 private let chunkDurationMs: Double = 480
var onChunk: ((Data) -> Void)? var onChunk: ((Data) -> Void)?
var onSilenceTimeout: (() -> Void)?
/// RMS threshold below which audio is considered silence (16-bit PCM range)
var silenceThreshold: Int16 = 200
/// Seconds of continuous silence before triggering timeout
var silenceTimeoutSeconds: Double = 20
private var buffer = Data() private var buffer = Data()
private let bytesPerChunk: Int private let bytesPerChunk: Int
private var consecutiveSilentChunks: Int = 0
private var chunksPerSecond: Double { 1000.0 / chunkDurationMs }
init() { init() {
// 16kHz * 2 bytes (16-bit) * 1 channel * 0.48s = 15360 bytes // 16kHz * 2 bytes (16-bit) * 1 channel * 0.48s = 15360 bytes
@@ -58,12 +66,25 @@ final class AudioCapture {
self.buffer.append(data) self.buffer.append(data)
while self.buffer.count >= self.bytesPerChunk { while self.buffer.count >= self.bytesPerChunk {
let chunk = self.buffer.prefix(self.bytesPerChunk) let chunk = Data(self.buffer.prefix(self.bytesPerChunk))
self.buffer = Data(self.buffer.dropFirst(self.bytesPerChunk)) self.buffer = Data(self.buffer.dropFirst(self.bytesPerChunk))
self.onChunk?(Data(chunk))
if self.isSilent(chunk) {
self.consecutiveSilentChunks += 1
let silentSeconds = Double(self.consecutiveSilentChunks) / self.chunksPerSecond
if silentSeconds >= self.silenceTimeoutSeconds {
self.onSilenceTimeout?()
return
}
} else {
self.consecutiveSilentChunks = 0
}
self.onChunk?(chunk)
} }
} }
consecutiveSilentChunks = 0
engine.prepare() engine.prepare()
try engine.start() try engine.start()
} }
@@ -78,6 +99,20 @@ final class AudioCapture {
buffer = Data() buffer = Data()
} }
} }
/// Returns `true` when the chunk's RMS amplitude falls below `silenceThreshold`.
///
/// - Parameter chunk: Raw 16-bit PCM samples (native endianness), as produced
///   by the capture tap.
/// - Returns: `true` for silence; an empty or unmappable chunk also counts as
///   silent so it never resets the silence counter.
private func isSilent(_ chunk: Data) -> Bool {
    chunk.withUnsafeBytes { rawBuffer in
        // NOTE(review): assumingMemoryBound requires 2-byte alignment of the
        // backing storage; Data's allocations satisfy this in practice, but
        // confirm if chunks ever come from a sliced/offset buffer.
        guard let samples = rawBuffer.baseAddress?.assumingMemoryBound(to: Int16.self) else { return true }
        let count = chunk.count / 2
        var sumSquares: Int64 = 0
        for i in 0..<count {
            let sample = Int64(samples[i])
            sumSquares += sample * sample
        }
        // Compare in Double rather than converting back to Int16: a chunk of
        // full-scale Int16.min samples yields an RMS of 32768, and
        // Int16(32768.0) traps at runtime. For an integer threshold the
        // comparison is equivalent (floor(rms) < T ⟺ rms < T).
        let rms = sqrt(Double(sumSquares) / Double(max(count, 1)))
        return rms < Double(silenceThreshold)
    }
}
} }
enum AudioCaptureError: Error, LocalizedError { enum AudioCaptureError: Error, LocalizedError {

View file

@@ -15,6 +15,8 @@ final class TranscriptionManager: ObservableObject {
private let wsClient = VoxtralWebSocketClient() private let wsClient = VoxtralWebSocketClient()
private let settings = AppSettings.shared private let settings = AppSettings.shared
private var hasRetried = false private var hasRetried = false
private var lastEventTime = Date()
private var staleCheckTimer: Timer?
var isRecording: Bool { state == .recording } var isRecording: Bool { state == .recording }
@@ -40,6 +42,8 @@ final class TranscriptionManager: ObservableObject {
} }
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs) wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
lastEventTime = Date()
startStaleCheck()
audioCapture.onChunk = { [weak self] chunk in audioCapture.onChunk = { [weak self] chunk in
Task { @MainActor in Task { @MainActor in
@@ -47,6 +51,13 @@ final class TranscriptionManager: ObservableObject {
} }
} }
audioCapture.onSilenceTimeout = { [weak self] in
Task { @MainActor in
print("[MyVoxtral] Silence timeout — stopping recording")
self?.stop()
}
}
do { do {
try audioCapture.start() try audioCapture.start()
state = .recording state = .recording
@@ -56,6 +67,8 @@ final class TranscriptionManager: ObservableObject {
} }
func stop() { func stop() {
staleCheckTimer?.invalidate()
staleCheckTimer = nil
audioCapture.stop() audioCapture.stop()
wsClient.flush() wsClient.flush()
wsClient.disconnect() wsClient.disconnect()
@@ -66,34 +79,59 @@ final class TranscriptionManager: ObservableObject {
} }
} }
/// Starts (or restarts) a repeating 10-second watchdog that keeps the
/// streaming WebSocket healthy while recording.
///
/// Every tick, if more than 30 seconds have passed since the last received
/// event the connection is assumed stale and is torn down and re-established;
/// otherwise a keep-alive ping is sent.
private func startStaleCheck() {
    // Invalidate any previous timer so at most one watchdog runs at a time.
    staleCheckTimer?.invalidate()
    staleCheckTimer = Timer.scheduledTimer(withTimeInterval: 10, repeats: true) { [weak self] _ in
        // Hop to the main actor: state, wsClient and lastEventTime belong to
        // this main-actor-facing manager.
        Task { @MainActor in
            // Only police the connection while actively recording.
            guard let self, self.isRecording else { return }
            let elapsed = Date().timeIntervalSince(self.lastEventTime)
            if elapsed > 30 {
                print("[MyVoxtral] Connection stale (\(Int(elapsed))s since last event) — reconnecting")
                // NOTE(review): presumably connect() re-registers the event
                // handler internally — verify, otherwise events after a
                // reconnect would be dropped.
                self.wsClient.disconnect()
                self.wsClient.connect(apiKey: self.settings.apiKey, delayMs: self.settings.streamingDelayMs)
                // Reset the clock so the next tick doesn't reconnect again
                // before the new session has a chance to emit an event.
                self.lastEventTime = Date()
            } else {
                // Connection looks live — send a keep-alive ping.
                self.wsClient.ping()
            }
        }
    }
}
private func handleEvent(_ event: VoxtralEvent) { private func handleEvent(_ event: VoxtralEvent) {
lastEventTime = Date()
switch event { switch event {
case .sessionCreated: case .sessionCreated:
break print("[MyVoxtral] Session created")
case .textDelta(let text): case .textDelta(let text):
currentText += text currentText += text
if settings.outputMode == .cursorInjection { if settings.outputMode == .cursorInjection {
CursorInjector.typeText(text) CursorInjector.typeText(text)
} }
case .segment: case .segment(let text, let start, let end):
break print("[MyVoxtral] Segment [\(String(format: "%.1f", start))-\(String(format: "%.1f", end))]: \(text.prefix(60))")
case .language: case .language(let lang):
break print("[MyVoxtral] Language detected: \(lang)")
case .done(let text): case .done(let text):
print("[MyVoxtral] Done, final text length: \(text.count)")
if currentText.isEmpty { if currentText.isEmpty {
currentText = text currentText = text
} }
case .error(let message): case .error(let message):
print("[MyVoxtral] Error: \(message)")
if !hasRetried && state == .recording { if !hasRetried && state == .recording {
hasRetried = true hasRetried = true
wsClient.disconnect() wsClient.disconnect()
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs) wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
lastEventTime = Date()
} else { } else {
state = .error(message) state = .error(message)
staleCheckTimer?.invalidate()
staleCheckTimer = nil
audioCapture.stop() audioCapture.stop()
} }
case .unknown: case .unknown(let type):
break print("[MyVoxtral] Unknown event: \(type)")
} }
} }
} }

View file

@@ -47,6 +47,14 @@ final class VoxtralWebSocketClient {
sendJSON(AudioFlushMessage()) sendJSON(AudioFlushMessage())
} }
/// Sends a WebSocket ping frame to keep the connection alive; failures are
/// logged and otherwise ignored.
func ping() {
    guard let task = webSocketTask else { return }
    task.sendPing { error in
        guard let error else { return }
        print("[MyVoxtral] Ping failed: \(error.localizedDescription)")
    }
}
func disconnect() { func disconnect() {
intentionalDisconnect = true intentionalDisconnect = true
sendJSON(AudioEndMessage()) sendJSON(AudioEndMessage())