feat: auto-stop on silence, fix transcription hangs

- Auto-stop recording after 20 seconds of continuous silence
  (RMS-based silence detection in AudioCapture)
- Add WebSocket ping every 10s to keep connection alive
- Auto-reconnect if no events received for 30s (stale connection)
- Add diagnostic logging for all event types to help debug issues

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Carsten Abele 2026-04-09 13:02:21 +02:00
parent 8b56202e90
commit 89b1c12d58
3 changed files with 90 additions and 9 deletions

@@ -6,9 +6,17 @@ final class AudioCapture {
private let chunkDurationMs: Double = 480
var onChunk: ((Data) -> Void)?
var onSilenceTimeout: (() -> Void)?
/// RMS threshold below which audio is considered silence (16-bit PCM range)
var silenceThreshold: Int16 = 200
/// Seconds of continuous silence before triggering timeout
var silenceTimeoutSeconds: Double = 20
private var buffer = Data()
private let bytesPerChunk: Int
private var consecutiveSilentChunks: Int = 0
private var chunksPerSecond: Double { 1000.0 / chunkDurationMs }
init() {
// 16kHz * 2 bytes (16-bit) * 1 channel * 0.48s = 15360 bytes
@@ -58,12 +66,25 @@ final class AudioCapture {
self.buffer.append(data)
while self.buffer.count >= self.bytesPerChunk {
let chunk = self.buffer.prefix(self.bytesPerChunk)
let chunk = Data(self.buffer.prefix(self.bytesPerChunk))
self.buffer = Data(self.buffer.dropFirst(self.bytesPerChunk))
self.onChunk?(Data(chunk))
if self.isSilent(chunk) {
self.consecutiveSilentChunks += 1
let silentSeconds = Double(self.consecutiveSilentChunks) / self.chunksPerSecond
if silentSeconds >= self.silenceTimeoutSeconds {
self.onSilenceTimeout?()
return
}
} else {
self.consecutiveSilentChunks = 0
}
self.onChunk?(chunk)
}
}
consecutiveSilentChunks = 0
engine.prepare()
try engine.start()
}
@@ -78,6 +99,20 @@ final class AudioCapture {
buffer = Data()
}
}
private func isSilent(_ chunk: Data) -> Bool {
chunk.withUnsafeBytes { rawBuffer in
guard let samples = rawBuffer.baseAddress?.assumingMemoryBound(to: Int16.self) else { return true }
let count = chunk.count / 2
var sumSquares: Int64 = 0
for i in 0..<count {
let sample = Int64(samples[i])
sumSquares += sample * sample
}
let rms = Int16(sqrt(Double(sumSquares) / Double(max(count, 1))))
return rms < silenceThreshold
}
}
}
enum AudioCaptureError: Error, LocalizedError {
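
A minimal sketch of how the silence-timeout surface added above can be wired up; the handler bodies and the call site are illustrative, not part of the commit. With 480 ms chunks, 20 s of continuous silence is roughly 42 consecutive silent chunks, and an RMS threshold of 200 on 16-bit PCM is about -44 dBFS (20 * log10(200 / 32768)).

// Sketch only: usage of the new AudioCapture silence-timeout API.
let capture = AudioCapture()
capture.silenceThreshold = 200        // default; roughly -44 dBFS on 16-bit PCM
capture.silenceTimeoutSeconds = 20    // roughly 42 consecutive 480 ms silent chunks
capture.onChunk = { chunk in
    // 15360-byte PCM chunk (16 kHz, 16-bit, mono, 480 ms) ready to stream
}
capture.onSilenceTimeout = {
    // fired from the audio processing callback once the silence budget is spent;
    // hop to the main actor before touching UI state, as TranscriptionManager does below
}
do {
    try capture.start()
} catch {
    print("audio capture failed: \(error)")
}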

@@ -15,6 +15,8 @@ final class TranscriptionManager: ObservableObject {
private let wsClient = VoxtralWebSocketClient()
private let settings = AppSettings.shared
private var hasRetried = false
private var lastEventTime = Date()
private var staleCheckTimer: Timer?
var isRecording: Bool { state == .recording }
@@ -40,6 +42,8 @@
}
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
lastEventTime = Date()
startStaleCheck()
audioCapture.onChunk = { [weak self] chunk in
Task { @MainActor in
@@ -47,6 +51,13 @@
}
}
audioCapture.onSilenceTimeout = { [weak self] in
Task { @MainActor in
print("[MyVoxtral] Silence timeout — stopping recording")
self?.stop()
}
}
do {
try audioCapture.start()
state = .recording
@@ -56,6 +67,8 @@
}
func stop() {
staleCheckTimer?.invalidate()
staleCheckTimer = nil
audioCapture.stop()
wsClient.flush()
wsClient.disconnect()
@@ -66,34 +79,59 @@
}
}
private func startStaleCheck() {
staleCheckTimer?.invalidate()
staleCheckTimer = Timer.scheduledTimer(withTimeInterval: 10, repeats: true) { [weak self] _ in
Task { @MainActor in
guard let self, self.isRecording else { return }
let elapsed = Date().timeIntervalSince(self.lastEventTime)
if elapsed > 30 {
print("[MyVoxtral] Connection stale (\(Int(elapsed))s since last event) — reconnecting")
self.wsClient.disconnect()
self.wsClient.connect(apiKey: self.settings.apiKey, delayMs: self.settings.streamingDelayMs)
self.lastEventTime = Date()
} else {
self.wsClient.ping()
}
}
}
}
private func handleEvent(_ event: VoxtralEvent) {
lastEventTime = Date()
switch event {
case .sessionCreated:
break
print("[MyVoxtral] Session created")
case .textDelta(let text):
currentText += text
if settings.outputMode == .cursorInjection {
CursorInjector.typeText(text)
}
case .segment:
break
case .language:
break
case .segment(let text, let start, let end):
print("[MyVoxtral] Segment [\(String(format: "%.1f", start))-\(String(format: "%.1f", end))]: \(text.prefix(60))")
case .language(let lang):
print("[MyVoxtral] Language detected: \(lang)")
case .done(let text):
print("[MyVoxtral] Done, final text length: \(text.count)")
if currentText.isEmpty {
currentText = text
}
case .error(let message):
print("[MyVoxtral] Error: \(message)")
if !hasRetried && state == .recording {
hasRetried = true
wsClient.disconnect()
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
lastEventTime = Date()
} else {
state = .error(message)
staleCheckTimer?.invalidate()
staleCheckTimer = nil
audioCapture.stop()
}
case .unknown:
break
case .unknown(let type):
print("[MyVoxtral] Unknown event: \(type)")
}
}
}
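
The keep-alive behavior above (ping every 10 s, reconnect after 30 s without events) can be read as a small policy. A sketch of that decision as a stand-alone function, with hypothetical names, assuming only the thresholds shown in startStaleCheck():

import Foundation

// Hypothetical helper mirroring the stale-check policy in startStaleCheck().
enum KeepAliveAction {
    case reconnect   // more than 30 s without an event: tear down and reopen the socket
    case ping        // connection looks healthy: send a WebSocket ping
}

func keepAliveAction(lastEventTime: Date,
                     now: Date = Date(),
                     staleAfter: TimeInterval = 30) -> KeepAliveAction {
    now.timeIntervalSince(lastEventTime) > staleAfter ? .reconnect : .ping
}

// Evaluated on a 10 s timer: an event 45 s ago yields .reconnect, one 5 s ago yields .ping.
let action = keepAliveAction(lastEventTime: Date().addingTimeInterval(-45))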

@@ -47,6 +47,14 @@ final class VoxtralWebSocketClient {
sendJSON(AudioFlushMessage())
}
func ping() {
webSocketTask?.sendPing { error in
if let error {
print("[MyVoxtral] Ping failed: \(error.localizedDescription)")
}
}
}
func disconnect() {
intentionalDisconnect = true
sendJSON(AudioEndMessage())
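
ping() delegates to the WebSocket task's built-in ping. A self-contained sketch of the same keep-alive pattern on a bare URLSessionWebSocketTask, on the 10 s cadence used by startStaleCheck(); the URL is a placeholder, not the real endpoint:

import Foundation

// Sketch only: periodic WebSocket pings to keep an idle connection alive.
let task = URLSession.shared.webSocketTask(with: URL(string: "wss://example.invalid/stream")!)
task.resume()

Timer.scheduledTimer(withTimeInterval: 10, repeats: true) { _ in
    task.sendPing { error in
        if let error {
            print("keep-alive ping failed: \(error.localizedDescription)")
        }
    }
}
RunLoop.main.run()   // keeps the timer firing when run as a script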