// MyVoxtral/MyVoxtral/Network/VoxtralMessages.swift

import Foundation

// MARK: - Outbound Messages (Client → Server)

/// Outbound message carrying one chunk of audio.
/// Encodes as `{"type": "input_audio.append", "audio": ...}`.
struct AudioAppendMessage: Encodable {
    let type = "input_audio.append"
    /// Base64-encoded PCM audio.
    let audio: String
}

/// Outbound message with wire type `input_audio.flush` and no other fields.
struct AudioFlushMessage: Encodable {
    let type = "input_audio.flush"
}

/// Outbound message with wire type `input_audio.end` and no other fields.
struct AudioEndMessage: Encodable {
    let type = "input_audio.end"
}

/// Outbound message with wire type `session.update` carrying a `SessionConfig`.
struct SessionUpdateMessage: Encodable {
    let type = "session.update"
    let session: SessionConfig
}

/// Session settings sent inside `SessionUpdateMessage`.
/// Properties are encoded under snake_case keys (see `CodingKeys`).
struct SessionConfig: Encodable {
    let audioFormat: AudioFormatConfig
    /// Encoded as `target_streaming_delay_ms`.
    let targetStreamingDelayMs: Int

    enum CodingKeys: String, CodingKey {
        case audioFormat = "audio_format"
        case targetStreamingDelayMs = "target_streaming_delay_ms"
    }
}

/// Fixed audio format description: encoding `"pcm_s16le"` at sample rate `16000`.
/// Both values are constants, so the type is created with `AudioFormatConfig()`.
struct AudioFormatConfig: Encodable {
    let encoding = "pcm_s16le"
    let sampleRate = 16000

    enum CodingKeys: String, CodingKey {
        case encoding
        case sampleRate = "sample_rate"
    }
}

// MARK: - Inbound Messages (Server → Client)

/// A server message after parsing by `parseVoxtralEvent(from:)`.
enum VoxtralEvent {
    /// Wire type `session.created`.
    case sessionCreated
    /// Wire type `transcription.text.delta`; carries the `text` field.
    case textDelta(String)
    /// Wire type `transcription.segment`. `start`/`end` are passed through as sent
    /// by the server (presumably seconds — confirm against the server docs).
    case segment(text: String, start: Double, end: Double)
    /// Wire type `transcription.language`; carries the `audio_language` field.
    case language(String)
    /// Wire type `transcription.done`; carries the `text` field.
    case done(text: String)
    /// Wire type `error`; carries the error detail, or `"Unknown error"` when absent.
    case error(String)
    /// Anything that could not be interpreted. Carries the type name for an
    /// unrecognized type, or the raw payload text when decoding failed.
    case unknown(String)
}

/// Minimal envelope used to read the `type` discriminator before decoding the payload.
struct IncomingEvent: Decodable {
    let type: String
}

/// Payload of a `transcription.text.delta` message.
struct TextDeltaEvent: Decodable {
    let text: String
}

/// Payload of a `transcription.language` message.
struct LanguageEvent: Decodable {
    let audioLanguage: String

    enum CodingKeys: String, CodingKey {
        case audioLanguage = "audio_language"
    }
}

/// Payload of a `transcription.segment` message.
struct SegmentEvent: Decodable {
    let text: String
    let start: Double
    let end: Double
}

/// Payload of a `transcription.done` message.
struct DoneEvent: Decodable {
    let text: String
}

/// Payload of an `error` message. Every level is optional so that a partially
/// populated error still decodes.
struct ErrorEvent: Decodable {
    let error: ErrorDetail?
}

/// The `error` object of an `ErrorEvent`.
struct ErrorDetail: Decodable {
    let message: ErrorMessage?
}

/// The `message` object of an `ErrorDetail`.
struct ErrorMessage: Decodable {
    let detail: String?
}

// MARK: - Event Parsing

/// Parses one JSON message from the server into a `VoxtralEvent`.
///
/// The `type` field is read first and selects the payload type to decode.
///
/// - Parameter data: The JSON bytes of a single server message.
/// - Returns: The matching event. Returns `.unknown` with the raw payload text
///   (empty if the bytes are not valid UTF-8) when the envelope or the payload
///   of a recognized type cannot be decoded, and `.unknown` with the type name
///   when the type is not recognized. An `error` message always yields `.error`,
///   falling back to `"Unknown error"` when no detail is present.
func parseVoxtralEvent(from data: Data) -> VoxtralEvent {
    // One decoder serves both the envelope and the payload decode.
    let decoder = JSONDecoder()

    // Builds the fallback event for input we cannot interpret. The raw text is
    // kept so that a malformed payload stays diagnosable rather than collapsing
    // to an empty string.
    func undecodable() -> VoxtralEvent {
        .unknown(String(data: data, encoding: .utf8) ?? "")
    }

    guard let envelope = try? decoder.decode(IncomingEvent.self, from: data) else {
        return undecodable()
    }

    switch envelope.type {
    case "session.created":
        return .sessionCreated

    case "transcription.text.delta":
        guard let event = try? decoder.decode(TextDeltaEvent.self, from: data) else {
            return undecodable()
        }
        return .textDelta(event.text)

    case "transcription.segment":
        guard let event = try? decoder.decode(SegmentEvent.self, from: data) else {
            return undecodable()
        }
        return .segment(text: event.text, start: event.start, end: event.end)

    case "transcription.language":
        guard let event = try? decoder.decode(LanguageEvent.self, from: data) else {
            return undecodable()
        }
        return .language(event.audioLanguage)

    case "transcription.done":
        guard let event = try? decoder.decode(DoneEvent.self, from: data) else {
            return undecodable()
        }
        return .done(text: event.text)

    case "error":
        // An error must always surface as `.error`, even when its body is
        // missing or malformed, so decode failures fall back to a generic text.
        let detail = (try? decoder.decode(ErrorEvent.self, from: data))?.error?.message?.detail
        return .error(detail ?? "Unknown error")

    default:
        return .unknown(envelope.type)
    }
}