Shows session and cumulative audio duration and cost ($0.006/min) from the transcription.done event's usage.prompt_audio_seconds field. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
138 lines
3.6 KiB
Swift
138 lines
3.6 KiB
Swift
import Foundation
|
|
|
|
// MARK: - Outbound Messages (Client → Server)
|
|
|
|
/// Client → server message that carries one chunk of audio.
///
/// Encodes to JSON with a constant `type` of `"input_audio.append"` and the
/// audio payload under `audio`.
struct AudioAppendMessage: Encodable {
    /// Message discriminator; fixed for this message kind.
    let type: String = "input_audio.append"
    /// Base64-encoded PCM audio.
    let audio: String
}
|
|
|
|
/// Client → server message whose only content is the constant
/// `type` of `"input_audio.flush"`.
struct AudioFlushMessage: Encodable {
    /// Message discriminator; fixed for this message kind.
    let type: String = "input_audio.flush"
}
|
|
|
|
/// Client → server message whose only content is the constant
/// `type` of `"input_audio.end"`.
struct AudioEndMessage: Encodable {
    /// Message discriminator; fixed for this message kind.
    let type: String = "input_audio.end"
}
|
|
|
|
/// Client → server message that sends a `SessionConfig` to the server.
///
/// Encodes to JSON with a constant `type` of `"session.update"` and the
/// configuration under `session`.
struct SessionUpdateMessage: Encodable {
    /// Message discriminator; fixed for this message kind.
    let type: String = "session.update"
    /// The session settings to send.
    let session: SessionConfig
}
|
|
|
|
/// Session settings carried in the `session` field of a `SessionUpdateMessage`.
struct SessionConfig: Encodable {
    /// Maps the camelCase property names onto the snake_case wire keys.
    enum CodingKeys: String, CodingKey {
        case audioFormat = "audio_format"
        case targetStreamingDelayMs = "target_streaming_delay_ms"
    }

    /// Audio format descriptor, encoded under `audio_format`.
    let audioFormat: AudioFormatConfig
    /// Encoded under `target_streaming_delay_ms`.
    /// NOTE(review): presumably a latency target in milliseconds — confirm against the API docs.
    let targetStreamingDelayMs: Int
}
|
|
|
|
/// Fixed audio format descriptor.
///
/// Both values are constants, so the type has no stored inputs; it encodes as
/// `{"encoding": "pcm_s16le", "sample_rate": 16000}`.
struct AudioFormatConfig: Encodable {
    /// Maps `sampleRate` onto the snake_case wire key; `encoding` is unchanged.
    enum CodingKeys: String, CodingKey {
        case encoding
        case sampleRate = "sample_rate"
    }

    /// Encoding identifier sent to the server.
    let encoding: String = "pcm_s16le"
    /// Sample rate sent to the server, encoded under `sample_rate`.
    let sampleRate: Int = 16_000
}
|
|
|
|
// MARK: - Inbound Messages (Server → Client)
|
|
|
|
/// A server message after parsing, as produced by `parseVoxtralEvent(from:)`.
enum VoxtralEvent {
    /// A `session.created` event.
    case sessionCreated

    /// The `text` of a `transcription.text.delta` event.
    case textDelta(String)

    /// A `transcription.segment` event: its `text`, `start` and `end` values.
    case segment(text: String, start: Double, end: Double)

    /// The `audio_language` value of a `transcription.language` event.
    case language(String)

    /// A `transcription.done` event: its `text` and, when the server included
    /// it, the `usage.prompt_audio_seconds` value.
    case done(text: String, audioSeconds: Int?)

    /// An `error` event; the string is the server-provided detail or a
    /// generic fallback.
    case error(String)

    /// A message that could not be mapped to any case above; the string is
    /// diagnostic text such as the unrecognized event type.
    case unknown(String)
}
|
|
|
|
/// Minimal envelope shared by every server message.
///
/// Only the `type` discriminator is decoded; it decides which concrete
/// payload struct the message is decoded into afterwards.
struct IncomingEvent: Decodable {
    /// Event type string, e.g. `"session.created"`.
    let type: String
}
|
|
|
|
/// Payload of a `transcription.text.delta` event.
struct TextDeltaEvent: Decodable {
    /// The text carried by this delta.
    let text: String
}
|
|
|
|
/// Payload of a `transcription.language` event.
struct LanguageEvent: Decodable {
    /// Maps `audioLanguage` onto the snake_case wire key.
    enum CodingKeys: String, CodingKey {
        case audioLanguage = "audio_language"
    }

    /// Value of the `audio_language` field.
    let audioLanguage: String
}
|
|
|
|
/// Payload of a `transcription.segment` event.
struct SegmentEvent: Decodable {
    /// Text of the segment.
    let text: String
    /// Value of the `start` field.
    /// NOTE(review): presumably an offset in seconds — confirm against the API docs.
    let start: Double
    /// Value of the `end` field; same unit as `start`.
    let end: Double
}
|
|
|
|
/// Payload of a `transcription.done` event.
struct DoneEvent: Decodable {
    /// Text carried by the event.
    let text: String
    /// Usage figures for the request; `nil` when the server omits them.
    let usage: UsageInfo?
}
|
|
|
|
/// Usage figures attached to a `transcription.done` event.
///
/// Every field is optional, so an empty `usage` object still decodes.
struct UsageInfo: Decodable {
    /// Maps the camelCase property names onto the snake_case wire keys.
    enum CodingKeys: String, CodingKey {
        case promptTokens = "prompt_tokens"
        case completionTokens = "completion_tokens"
        case totalTokens = "total_tokens"
        case promptAudioSeconds = "prompt_audio_seconds"
    }

    /// Value of `prompt_tokens`, if present.
    let promptTokens: Int?
    /// Value of `completion_tokens`, if present.
    let completionTokens: Int?
    /// Value of `total_tokens`, if present.
    let totalTokens: Int?
    /// Value of `prompt_audio_seconds`, if present.
    let promptAudioSeconds: Int?
}
|
|
|
|
/// Payload of an `error` event.
struct ErrorEvent: Decodable {
    /// Contents of the `error` field; `nil` when the field is absent.
    let error: ErrorDetail?
}
|
|
|
|
/// Contents of the `error` field of an `error` event.
struct ErrorDetail: Decodable {
    /// Contents of the `message` field; `nil` when the field is absent.
    let message: ErrorMessage?
}
|
|
|
|
/// Contents of the `message` field inside an `error` event.
///
/// The field is accepted in two shapes:
/// - an object with an optional `detail` string, e.g. `{"detail": "..."}`;
/// - a plain JSON string, which is used directly as `detail`.
///
/// Without the string form, a plain-string `message` would fail decoding of
/// the whole error event and the real error text would be lost.
struct ErrorMessage: Decodable {
    /// Human-readable error text, when the server supplied one.
    let detail: String?
}

// The decoding initializer lives in an extension so the struct keeps its
// synthesized memberwise initializer `ErrorMessage(detail:)`.
extension ErrorMessage {
    private enum CodingKeys: String, CodingKey {
        case detail
    }

    /// Decodes either a bare string or an object with a `detail` key.
    ///
    /// - Throws: `DecodingError` when the value is neither a string nor an
    ///   object, or when `detail` is present but is not a string.
    init(from decoder: Decoder) throws {
        // Bare-string form: the string itself is the detail text.
        if let plainText = try? decoder.singleValueContainer().decode(String.self) {
            detail = plainText
            return
        }
        // Object form: `detail` is optional; a missing or null value yields nil.
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        detail = try fields.decodeIfPresent(String.self, forKey: .detail)
    }
}
|
|
|
|
// MARK: - Event Parsing
|
|
|
|
/// Decodes one raw server message into a `VoxtralEvent`.
///
/// The message is first decoded as an `IncomingEvent` to read its `type`,
/// then decoded again as the payload struct that matches that type.
///
/// - Parameter data: The JSON bytes of a single server message.
/// - Returns:
///   - the matching event case when the payload decodes;
///   - `.error` for `error` events, with `"Unknown error"` when no detail
///     text can be extracted;
///   - `.unknown(type)` for an event type this function does not handle;
///   - `.unknown(rawText)` when the envelope or the payload of a handled
///     type cannot be decoded, so the caller can see what was received.
func parseVoxtralEvent(from data: Data) -> VoxtralEvent {
    // One decoder serves both the envelope pass and the payload pass.
    let decoder = JSONDecoder()

    /// Decodes `data` as `type`, or returns nil when it does not match.
    func decoded<Payload: Decodable>(_ type: Payload.Type) -> Payload? {
        return try? decoder.decode(type, from: data)
    }

    /// Fallback for undecodable input: the raw message text, or "" when the
    /// bytes are not valid UTF-8.
    func undecodable() -> VoxtralEvent {
        return .unknown(String(data: data, encoding: .utf8) ?? "")
    }

    guard let envelope = decoded(IncomingEvent.self) else {
        return undecodable()
    }

    switch envelope.type {
    case "session.created":
        return .sessionCreated

    case "transcription.text.delta":
        guard let event = decoded(TextDeltaEvent.self) else { return undecodable() }
        return .textDelta(event.text)

    case "transcription.segment":
        guard let event = decoded(SegmentEvent.self) else { return undecodable() }
        return .segment(text: event.text, start: event.start, end: event.end)

    case "transcription.language":
        guard let event = decoded(LanguageEvent.self) else { return undecodable() }
        return .language(event.audioLanguage)

    case "transcription.done":
        guard let event = decoded(DoneEvent.self) else { return undecodable() }
        return .done(text: event.text, audioSeconds: event.usage?.promptAudioSeconds)

    case "error":
        // An error event is always surfaced as `.error`, even when its
        // payload is malformed or carries no detail text.
        let detail = decoded(ErrorEvent.self)?.error?.message?.detail
        return .error(detail ?? "Unknown error")

    default:
        return .unknown(envelope.type)
    }
}
|