myVoxtral/MyVoxtral/MyVoxtral/Network/VoxtralMessages.swift
Carsten Abele 3c2cf277bb feat: add usage cost display in menu bar dropdown
Shows session and cumulative audio duration and cost ($0.006/min)
from the transcription.done event's usage.prompt_audio_seconds field.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 21:38:49 +02:00

138 lines
3.6 KiB
Swift

import Foundation
// MARK: - Outbound Messages (Client → Server)
/// Outbound message that carries one chunk of audio.
///
/// Encodes to a JSON object with the keys `type` and `audio`.
struct AudioAppendMessage: Encodable {
    /// Message discriminator; always `"input_audio.append"`.
    let type: String = "input_audio.append"

    /// The audio chunk as base64-encoded PCM.
    let audio: String
}
/// Outbound message with no payload besides its discriminator.
///
/// Encodes to `{"type": "input_audio.flush"}`.
struct AudioFlushMessage: Encodable {
    /// Message discriminator; always `"input_audio.flush"`.
    let type: String = "input_audio.flush"
}
/// Outbound message with no payload besides its discriminator.
///
/// Encodes to `{"type": "input_audio.end"}`.
struct AudioEndMessage: Encodable {
    /// Message discriminator; always `"input_audio.end"`.
    let type: String = "input_audio.end"
}
/// Outbound message that sends a session configuration.
///
/// Encodes to a JSON object with the keys `type` and `session`.
struct SessionUpdateMessage: Encodable {
    /// Message discriminator; always `"session.update"`.
    let type: String = "session.update"

    /// The configuration to send, encoded under the `session` key.
    let session: SessionConfig
}
/// Session settings carried inside a `SessionUpdateMessage`.
struct SessionConfig: Encodable {
    /// Maps the camelCase properties onto the snake_case JSON keys.
    enum CodingKeys: String, CodingKey {
        case audioFormat = "audio_format"
        case targetStreamingDelayMs = "target_streaming_delay_ms"
    }

    /// Audio format description, encoded as `audio_format`.
    let audioFormat: AudioFormatConfig

    /// Encoded as `target_streaming_delay_ms`; per the name, a delay in milliseconds.
    let targetStreamingDelayMs: Int
}
/// Fixed audio format description sent as part of the session configuration.
///
/// Both values are constants, so every instance encodes to the same JSON.
struct AudioFormatConfig: Encodable {
    /// Maps `sampleRate` onto the snake_case JSON key; `encoding` keeps its name.
    enum CodingKeys: String, CodingKey {
        case encoding
        case sampleRate = "sample_rate"
    }

    /// Encoding identifier; always `"pcm_s16le"`.
    let encoding: String = "pcm_s16le"

    /// Sample rate; always 16000 (presumably Hz).
    let sampleRate: Int = 16_000
}
// MARK: - Inbound Messages (Server → Client)
/// A server message after parsing, as produced by `parseVoxtralEvent(from:)`.
enum VoxtralEvent {
    /// A `session.created` message was received.
    case sessionCreated

    /// Text carried by a `transcription.text.delta` message.
    case textDelta(String)

    /// A `transcription.segment` message: its text plus `start` and `end` values.
    case segment(text: String, start: Double, end: Double)

    /// The `audio_language` value of a `transcription.language` message.
    case language(String)

    /// A `transcription.done` message: its text and, when usage information
    /// was included, the reported `prompt_audio_seconds`.
    case done(text: String, audioSeconds: Int?)

    /// An `error` message; the string is the error description.
    case error(String)

    /// Anything the parser could not map to another case; the string is a
    /// diagnostic supplied by the parser (for example the unrecognized type).
    case unknown(String)
}
/// Minimal envelope decoded from every inbound message.
///
/// Only the `type` discriminator is read; the parser uses it to pick the
/// concrete payload type for a second decoding pass.
struct IncomingEvent: Decodable {
    /// The message's `type` field, e.g. `"transcription.done"`.
    let type: String
}
/// Payload of a `transcription.text.delta` message.
struct TextDeltaEvent: Decodable {
    /// The message's `text` field (required).
    let text: String
}
/// Payload of a `transcription.language` message.
struct LanguageEvent: Decodable {
    /// Maps `audioLanguage` onto the snake_case JSON key.
    enum CodingKeys: String, CodingKey {
        case audioLanguage = "audio_language"
    }

    /// The message's `audio_language` field (required).
    let audioLanguage: String
}
/// Payload of a `transcription.segment` message.
///
/// All three fields are required; decoding fails if any is missing.
struct SegmentEvent: Decodable {
    /// The message's `text` field.
    let text: String

    /// The message's `start` field (presumably seconds — confirm against the API).
    let start: Double

    /// The message's `end` field (presumably seconds — confirm against the API).
    let end: Double
}
/// Payload of a `transcription.done` message.
struct DoneEvent: Decodable {
    /// The message's `text` field (required).
    let text: String

    /// Usage information, if the message includes a `usage` object.
    let usage: UsageInfo?
}
/// The `usage` object of a `transcription.done` message.
///
/// Every field is optional, so an empty object decodes with all values `nil`.
struct UsageInfo: Decodable {
    /// Maps the camelCase properties onto the snake_case JSON keys.
    enum CodingKeys: String, CodingKey {
        case promptTokens = "prompt_tokens"
        case completionTokens = "completion_tokens"
        case totalTokens = "total_tokens"
        case promptAudioSeconds = "prompt_audio_seconds"
    }

    /// Decoded from `prompt_tokens`.
    let promptTokens: Int?

    /// Decoded from `completion_tokens`.
    let completionTokens: Int?

    /// Decoded from `total_tokens`.
    let totalTokens: Int?

    /// Decoded from `prompt_audio_seconds`; the parser forwards this value
    /// as the `audioSeconds` of `VoxtralEvent.done`.
    let promptAudioSeconds: Int?
}
/// Payload of an `error` message: `{"error": {"message": ...}}`.
struct ErrorEvent: Decodable {
    /// The `error` object, if present.
    let error: ErrorDetail?
}

/// The `error` object of an `error` message.
struct ErrorDetail: Decodable {
    /// The error's `message`, normalized to the structured form.
    let message: ErrorMessage?
}

extension ErrorDetail {
    private enum CodingKeys: String, CodingKey {
        case message
    }

    /// Decodes `message` leniently so that a usable error text is not lost.
    ///
    /// Accepted shapes for `message`:
    /// - an object (`{"detail": "..."}`), decoded as `ErrorMessage`;
    /// - a plain string, wrapped as `ErrorMessage(detail:)`;
    /// - absent, `null`, or any other type, yielding `nil`.
    ///
    /// NOTE(review): the plain-string shape is assumed from the API's client
    /// SDKs, where `message` may be a string or an object — confirm against
    /// the server's actual error payloads.
    ///
    /// Declared in an extension so the struct keeps its memberwise initializer.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        if let structured = try? container.decodeIfPresent(ErrorMessage.self, forKey: .message) {
            message = structured
        } else if let text = try? container.decode(String.self, forKey: .message) {
            message = ErrorMessage(detail: text)
        } else {
            message = nil
        }
    }
}

/// Structured form of an error's `message`.
struct ErrorMessage: Decodable {
    /// Human-readable error text, if provided.
    let detail: String?
}
// MARK: - Event Parsing
/// Parses one raw inbound message into a `VoxtralEvent`.
///
/// The payload is decoded twice: first as an `IncomingEvent` envelope to read
/// the `type` discriminator, then as the concrete payload type for that
/// discriminator.
///
/// - Parameter data: The raw JSON bytes of a single server message.
/// - Returns: The matching event. Failure cases never throw:
///   - If the envelope cannot be decoded, or a recognized event type has a
///     payload that does not match its schema, returns `.unknown` carrying the
///     raw payload text (empty if the bytes are not valid UTF-8), so the
///     offending message can still be diagnosed.
///   - If the type is not recognized, returns `.unknown` carrying the type.
///   - If an `error` message has no readable detail, returns
///     `.error("Unknown error")`.
func parseVoxtralEvent(from data: Data) -> VoxtralEvent {
    // One decoder serves both the envelope pass and the payload pass.
    let decoder = JSONDecoder()

    /// Decodes the message as `T`, or returns `nil` if it does not fit.
    func decode<T: Decodable>(_ type: T.Type) -> T? {
        try? decoder.decode(type, from: data)
    }

    /// Fallback event for a message that could not be decoded.
    func undecodable() -> VoxtralEvent {
        .unknown(String(data: data, encoding: .utf8) ?? "")
    }

    guard let envelope = decode(IncomingEvent.self) else {
        return undecodable()
    }

    switch envelope.type {
    case "session.created":
        return .sessionCreated

    case "transcription.text.delta":
        guard let event = decode(TextDeltaEvent.self) else { return undecodable() }
        return .textDelta(event.text)

    case "transcription.segment":
        guard let event = decode(SegmentEvent.self) else { return undecodable() }
        return .segment(text: event.text, start: event.start, end: event.end)

    case "transcription.language":
        guard let event = decode(LanguageEvent.self) else { return undecodable() }
        return .language(event.audioLanguage)

    case "transcription.done":
        guard let event = decode(DoneEvent.self) else { return undecodable() }
        return .done(text: event.text, audioSeconds: event.usage?.promptAudioSeconds)

    case "error":
        // An error is always surfaced as `.error`, even without a readable detail.
        return .error(decode(ErrorEvent.self)?.error?.message?.detail ?? "Unknown error")

    default:
        return .unknown(envelope.type)
    }
}