feat: auto-stop on silence, fix transcription hangs
- Auto-stop recording after 20 seconds of continuous silence (RMS-based silence detection in AudioCapture)
- Add WebSocket ping every 10s to keep connection alive
- Auto-reconnect if no events received for 30s (stale connection)
- Add diagnostic logging for all event types to help debug issues

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8b56202e90
commit
89b1c12d58
3 changed files with 90 additions and 9 deletions
|
|
@@ -6,9 +6,17 @@ final class AudioCapture {
|
|||
private let chunkDurationMs: Double = 480
|
||||
|
||||
var onChunk: ((Data) -> Void)?
|
||||
var onSilenceTimeout: (() -> Void)?
|
||||
|
||||
/// RMS threshold below which audio is considered silence (16-bit PCM range)
|
||||
var silenceThreshold: Int16 = 200
|
||||
/// Seconds of continuous silence before triggering timeout
|
||||
var silenceTimeoutSeconds: Double = 20
|
||||
|
||||
private var buffer = Data()
|
||||
private let bytesPerChunk: Int
|
||||
private var consecutiveSilentChunks: Int = 0
|
||||
private var chunksPerSecond: Double { 1000.0 / chunkDurationMs }
|
||||
|
||||
init() {
|
||||
// 16kHz * 2 bytes (16-bit) * 1 channel * 0.48s = 15360 bytes
|
||||
|
|
@@ -58,12 +66,25 @@ final class AudioCapture {
|
|||
self.buffer.append(data)
|
||||
|
||||
while self.buffer.count >= self.bytesPerChunk {
|
||||
let chunk = self.buffer.prefix(self.bytesPerChunk)
|
||||
let chunk = Data(self.buffer.prefix(self.bytesPerChunk))
|
||||
self.buffer = Data(self.buffer.dropFirst(self.bytesPerChunk))
|
||||
self.onChunk?(Data(chunk))
|
||||
|
||||
if self.isSilent(chunk) {
|
||||
self.consecutiveSilentChunks += 1
|
||||
let silentSeconds = Double(self.consecutiveSilentChunks) / self.chunksPerSecond
|
||||
if silentSeconds >= self.silenceTimeoutSeconds {
|
||||
self.onSilenceTimeout?()
|
||||
return
|
||||
}
|
||||
} else {
|
||||
self.consecutiveSilentChunks = 0
|
||||
}
|
||||
|
||||
self.onChunk?(chunk)
|
||||
}
|
||||
}
|
||||
|
||||
consecutiveSilentChunks = 0
|
||||
engine.prepare()
|
||||
try engine.start()
|
||||
}
|
||||
|
|
@@ -78,6 +99,20 @@ final class AudioCapture {
|
|||
buffer = Data()
|
||||
}
|
||||
}
|
||||
|
||||
private func isSilent(_ chunk: Data) -> Bool {
|
||||
chunk.withUnsafeBytes { rawBuffer in
|
||||
guard let samples = rawBuffer.baseAddress?.assumingMemoryBound(to: Int16.self) else { return true }
|
||||
let count = chunk.count / 2
|
||||
var sumSquares: Int64 = 0
|
||||
for i in 0..<count {
|
||||
let sample = Int64(samples[i])
|
||||
sumSquares += sample * sample
|
||||
}
|
||||
let rms = Int16(sqrt(Double(sumSquares) / Double(max(count, 1))))
|
||||
return rms < silenceThreshold
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum AudioCaptureError: Error, LocalizedError {
|
||||
|
|
|
|||
|
|
@@ -15,6 +15,8 @@ final class TranscriptionManager: ObservableObject {
|
|||
private let wsClient = VoxtralWebSocketClient()
|
||||
private let settings = AppSettings.shared
|
||||
private var hasRetried = false
|
||||
private var lastEventTime = Date()
|
||||
private var staleCheckTimer: Timer?
|
||||
|
||||
var isRecording: Bool { state == .recording }
|
||||
|
||||
|
|
@@ -40,6 +42,8 @@ final class TranscriptionManager: ObservableObject {
|
|||
}
|
||||
|
||||
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
|
||||
lastEventTime = Date()
|
||||
startStaleCheck()
|
||||
|
||||
audioCapture.onChunk = { [weak self] chunk in
|
||||
Task { @MainActor in
|
||||
|
|
@@ -47,6 +51,13 @@ final class TranscriptionManager: ObservableObject {
|
|||
}
|
||||
}
|
||||
|
||||
audioCapture.onSilenceTimeout = { [weak self] in
|
||||
Task { @MainActor in
|
||||
print("[MyVoxtral] Silence timeout — stopping recording")
|
||||
self?.stop()
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
try audioCapture.start()
|
||||
state = .recording
|
||||
|
|
@@ -56,6 +67,8 @@ final class TranscriptionManager: ObservableObject {
|
|||
}
|
||||
|
||||
func stop() {
|
||||
staleCheckTimer?.invalidate()
|
||||
staleCheckTimer = nil
|
||||
audioCapture.stop()
|
||||
wsClient.flush()
|
||||
wsClient.disconnect()
|
||||
|
|
@@ -66,34 +79,59 @@ final class TranscriptionManager: ObservableObject {
|
|||
}
|
||||
}
|
||||
|
||||
private func startStaleCheck() {
|
||||
staleCheckTimer?.invalidate()
|
||||
staleCheckTimer = Timer.scheduledTimer(withTimeInterval: 10, repeats: true) { [weak self] _ in
|
||||
Task { @MainActor in
|
||||
guard let self, self.isRecording else { return }
|
||||
let elapsed = Date().timeIntervalSince(self.lastEventTime)
|
||||
if elapsed > 30 {
|
||||
print("[MyVoxtral] Connection stale (\(Int(elapsed))s since last event) — reconnecting")
|
||||
self.wsClient.disconnect()
|
||||
self.wsClient.connect(apiKey: self.settings.apiKey, delayMs: self.settings.streamingDelayMs)
|
||||
self.lastEventTime = Date()
|
||||
} else {
|
||||
self.wsClient.ping()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func handleEvent(_ event: VoxtralEvent) {
|
||||
lastEventTime = Date()
|
||||
|
||||
switch event {
|
||||
case .sessionCreated:
|
||||
break
|
||||
print("[MyVoxtral] Session created")
|
||||
case .textDelta(let text):
|
||||
currentText += text
|
||||
if settings.outputMode == .cursorInjection {
|
||||
CursorInjector.typeText(text)
|
||||
}
|
||||
case .segment:
|
||||
break
|
||||
case .language:
|
||||
break
|
||||
case .segment(let text, let start, let end):
|
||||
print("[MyVoxtral] Segment [\(String(format: "%.1f", start))-\(String(format: "%.1f", end))]: \(text.prefix(60))")
|
||||
case .language(let lang):
|
||||
print("[MyVoxtral] Language detected: \(lang)")
|
||||
case .done(let text):
|
||||
print("[MyVoxtral] Done, final text length: \(text.count)")
|
||||
if currentText.isEmpty {
|
||||
currentText = text
|
||||
}
|
||||
case .error(let message):
|
||||
print("[MyVoxtral] Error: \(message)")
|
||||
if !hasRetried && state == .recording {
|
||||
hasRetried = true
|
||||
wsClient.disconnect()
|
||||
wsClient.connect(apiKey: settings.apiKey, delayMs: settings.streamingDelayMs)
|
||||
lastEventTime = Date()
|
||||
} else {
|
||||
state = .error(message)
|
||||
staleCheckTimer?.invalidate()
|
||||
staleCheckTimer = nil
|
||||
audioCapture.stop()
|
||||
}
|
||||
case .unknown:
|
||||
break
|
||||
case .unknown(let type):
|
||||
print("[MyVoxtral] Unknown event: \(type)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -47,6 +47,14 @@ final class VoxtralWebSocketClient {
|
|||
sendJSON(AudioFlushMessage())
|
||||
}
|
||||
|
||||
func ping() {
|
||||
webSocketTask?.sendPing { error in
|
||||
if let error {
|
||||
print("[MyVoxtral] Ping failed: \(error.localizedDescription)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func disconnect() {
|
||||
intentionalDisconnect = true
|
||||
sendJSON(AudioEndMessage())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue