feat: add Voxtral WebSocket message types and parser

This commit is contained in:
Carsten Abele 2026-04-07 19:38:38 +02:00
parent 4b1cae1b5f
commit e5395017c2

View file

@ -0,0 +1,123 @@
import Foundation
// MARK: - Outbound Messages (Client → Server)
/// Outbound message carrying one chunk of audio.
///
/// Encodes to a JSON object with the fixed `type` discriminator
/// `"input_audio.append"` and the `audio` payload.
struct AudioAppendMessage: Encodable {
    /// Message discriminator; constant for this message kind.
    let type: String = "input_audio.append"

    /// Audio chunk as a base64-encoded PCM string.
    let audio: String
}
/// Outbound message with no payload other than its discriminator.
///
/// Encodes to `{"type": "input_audio.flush"}`.
struct AudioFlushMessage: Encodable {
    /// Message discriminator; constant for this message kind.
    let type: String = "input_audio.flush"
}
/// Outbound message with no payload other than its discriminator.
///
/// Encodes to `{"type": "input_audio.end"}`.
struct AudioEndMessage: Encodable {
    /// Message discriminator; constant for this message kind.
    let type: String = "input_audio.end"
}
/// Outbound message that sends a `SessionConfig` to the server.
///
/// Encodes to a JSON object with the fixed `type` discriminator
/// `"session.update"` and the configuration under the `session` key.
struct SessionUpdateMessage: Encodable {
    /// Message discriminator; constant for this message kind.
    let type: String = "session.update"

    /// Session settings to transmit.
    let session: SessionConfig
}
/// Session settings carried by `SessionUpdateMessage`.
///
/// Property names are camelCase in Swift and mapped to snake_case JSON keys
/// through `CodingKeys`.
struct SessionConfig: Encodable {
    /// JSON key mapping for the encoded representation.
    enum CodingKeys: String, CodingKey {
        case audioFormat = "audio_format"
        case targetStreamingDelayMs = "target_streaming_delay_ms"
    }

    /// Format of the audio being sent; encoded as `audio_format`.
    let audioFormat: AudioFormatConfig

    /// Encoded as `target_streaming_delay_ms` (presumably milliseconds, per the name).
    let targetStreamingDelayMs: Int
}
/// Audio format descriptor embedded in `SessionConfig` under `audio_format`.
///
/// Both values used to be hard-coded constants. They are now initializer
/// parameters whose defaults are those same constants, so `AudioFormatConfig()`
/// keeps producing `{"encoding": "pcm_s16le", "sample_rate": 16000}` while
/// callers that capture audio differently can describe it accurately.
struct AudioFormatConfig: Encodable {
    /// Sample encoding identifier sent to the server.
    let encoding: String

    /// Sample rate; encoded as `sample_rate` (presumably in Hz — confirm against the API).
    let sampleRate: Int

    /// Creates a format descriptor.
    ///
    /// - Parameters:
    ///   - encoding: Sample encoding identifier. Defaults to `"pcm_s16le"`.
    ///   - sampleRate: Sample rate of the audio. Defaults to `16000`.
    init(encoding: String = "pcm_s16le", sampleRate: Int = 16000) {
        self.encoding = encoding
        self.sampleRate = sampleRate
    }

    /// JSON key mapping for the encoded representation.
    enum CodingKeys: String, CodingKey {
        case encoding
        case sampleRate = "sample_rate"
    }
}
// MARK: - Inbound Messages (Server → Client)
/// Typed representation of a message received from the server, as produced by
/// `parseVoxtralEvent(from:)`.
enum VoxtralEvent {
    /// A `session.created` message was received.
    case sessionCreated

    /// Text carried by a `transcription.text.delta` message.
    case textDelta(String)

    /// Text with `start` and `end` values from a `transcription.segment` message.
    case segment(text: String, start: Double, end: Double)

    /// The `audio_language` value of a `transcription.language` message.
    case language(String)

    /// Text carried by a `transcription.done` message.
    case done(text: String)

    /// An `error` message; the associated value is a description of the failure.
    case error(String)

    /// A message that could not be mapped to any other case; the associated
    /// value is diagnostic text supplied by the parser.
    case unknown(String)
}
/// Minimal envelope shared by every inbound message.
///
/// Only the `type` discriminator is decoded; all other keys are ignored so the
/// parser can pick the concrete payload type afterwards.
struct IncomingEvent: Decodable {
    /// Event discriminator, e.g. `"session.created"`.
    let type: String
}
/// Payload decoded for `transcription.text.delta` messages.
struct TextDeltaEvent: Decodable {
    /// Value of the `text` key.
    let text: String
}
/// Payload decoded for `transcription.language` messages.
struct LanguageEvent: Decodable {
    /// JSON key mapping for the decoded representation.
    enum CodingKeys: String, CodingKey {
        case audioLanguage = "audio_language"
    }

    /// Value of the `audio_language` key.
    let audioLanguage: String
}
/// Payload decoded for `transcription.segment` messages.
struct SegmentEvent: Decodable {
    /// Value of the `text` key.
    let text: String

    /// Value of the `start` key (presumably a time offset in seconds — confirm against the API).
    let start: Double

    /// Value of the `end` key (presumably a time offset in seconds — confirm against the API).
    let end: Double
}
/// Payload decoded for `transcription.done` messages.
struct DoneEvent: Decodable {
    /// Value of the `text` key.
    let text: String
}
/// Payload decoded for `error` messages.
struct ErrorEvent: Decodable {
    /// Value of the `error` key; `nil` when the key is absent or null.
    let error: ErrorDetail?
}
/// The object found under the `error` key of an `error` message.
///
/// `message` is decoded tolerantly: a JSON object is decoded as `ErrorMessage`,
/// and a plain JSON string is wrapped as `ErrorMessage(detail:)`. With the
/// synthesized decoder a string-valued `message` made the whole error payload
/// fail to decode, so the human-readable text was lost.
/// NOTE(review): the published API schema appears to allow both shapes for
/// `message` — confirm against the service documentation.
struct ErrorDetail: Decodable {
    /// Error description; `nil` when the `message` key is absent or null.
    let message: ErrorMessage?
}

// The decoder lives in an extension so the memberwise initializer
// `ErrorDetail(message:)` stays available.
extension ErrorDetail {
    private enum CodingKeys: String, CodingKey {
        case message
    }

    /// Decodes `message` from either a string or an object.
    ///
    /// - Throws: `DecodingError` when the container is not keyed, or when
    ///   `message` is present but is neither a string, an object decodable as
    ///   `ErrorMessage`, nor null.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // `try?` is deliberate: a type mismatch here only means "not a string",
        // in which case the object form is attempted next.
        if let text = try? container.decode(String.self, forKey: .message) {
            message = ErrorMessage(detail: text)
        } else {
            message = try container.decodeIfPresent(ErrorMessage.self, forKey: .message)
        }
    }
}
/// The object found under `error.message` of an `error` message.
struct ErrorMessage: Decodable {
    /// Value of the `detail` key; `nil` when the key is absent or null.
    let detail: String?
}
// MARK: - Event Parsing
/// Converts one raw server message into a `VoxtralEvent`.
///
/// The payload is decoded in two passes: first as an `IncomingEvent` envelope to
/// read the `type` discriminator, then as the payload struct for that type.
///
/// - Parameter data: JSON bytes of a single inbound message.
/// - Returns: The event matching the payload. The function never throws;
///   failures are reported through the returned value:
///   - payload is not JSON or has no string `type`: `.unknown` carrying the
///     payload as UTF-8 text (empty string if it is not valid UTF-8);
///   - `type` is recognised but the body does not decode: `.unknown` carrying
///     the payload text (previously an empty string, which made malformed
///     messages impossible to diagnose);
///   - `type` is not recognised: `.unknown` carrying the `type` string;
///   - `type` is `"error"` but no detail can be extracted: `.error("Unknown error")`.
func parseVoxtralEvent(from data: Data) -> VoxtralEvent {
    // One decoder serves both the envelope pass and the payload pass.
    let decoder = JSONDecoder()

    // Decodes the payload as `T`, mapping any decoding failure to nil.
    func payload<T: Decodable>(_ type: T.Type) -> T? {
        return try? decoder.decode(type, from: data)
    }

    // Fallback for payloads that cannot be decoded: keep the raw text for diagnostics.
    func malformed() -> VoxtralEvent {
        return .unknown(String(data: data, encoding: .utf8) ?? "")
    }

    guard let envelope = payload(IncomingEvent.self) else {
        return malformed()
    }

    switch envelope.type {
    case "session.created":
        return .sessionCreated

    case "transcription.text.delta":
        guard let event = payload(TextDeltaEvent.self) else { return malformed() }
        return .textDelta(event.text)

    case "transcription.segment":
        guard let event = payload(SegmentEvent.self) else { return malformed() }
        return .segment(text: event.text, start: event.start, end: event.end)

    case "transcription.language":
        guard let event = payload(LanguageEvent.self) else { return malformed() }
        return .language(event.audioLanguage)

    case "transcription.done":
        guard let event = payload(DoneEvent.self) else { return malformed() }
        return .done(text: event.text)

    case "error":
        // `detail` is nil when the payload lacks `error.message.detail` or when
        // the body fails to decode as `ErrorEvent`; both cases report a generic error.
        let detail = payload(ErrorEvent.self)?.error?.message?.detail
        return .error(detail ?? "Unknown error")

    default:
        return .unknown(envelope.type)
    }
}