From b57bec42739bbe93f35336310caa4b967068e27f Mon Sep 17 00:00:00 2001 From: Carsten Abele Date: Tue, 7 Apr 2026 20:20:12 +0200 Subject: [PATCH] fix: WebSocket connection, global shortcut, and accessibility - Add model query param to WebSocket URL (was causing handshake failure) - Suppress "connection lost" error on intentional disconnect - Fix shortcut recording with NSEvent local monitor - Add proper keycode-to-string mapping for shortcut display - Move app lifecycle to NSApplicationDelegate for reliable window management - Prompt for Accessibility permission on first launch - Add build-app.sh for proper .app bundle creation Co-Authored-By: Claude Opus 4.6 (1M context) --- MyVoxtral/MyVoxtral/Models/AppSettings.swift | 21 ++++- MyVoxtral/MyVoxtral/MyVoxtralApp.swift | 81 ++++++++++++------- .../Network/VoxtralWebSocketClient.swift | 30 ++++--- .../MyVoxtral/Utilities/GlobalShortcut.swift | 24 ++++-- MyVoxtral/MyVoxtral/Views/MenuBarView.swift | 1 - MyVoxtral/MyVoxtral/Views/SettingsView.swift | 48 ++++++++++- MyVoxtral/build-app.sh | 29 +++++++ 7 files changed, 184 insertions(+), 50 deletions(-) create mode 100755 MyVoxtral/build-app.sh diff --git a/MyVoxtral/MyVoxtral/Models/AppSettings.swift b/MyVoxtral/MyVoxtral/Models/AppSettings.swift index 43c4c8a..af98be7 100644 --- a/MyVoxtral/MyVoxtral/Models/AppSettings.swift +++ b/MyVoxtral/MyVoxtral/Models/AppSettings.swift @@ -37,9 +37,24 @@ final class AppSettings: ObservableObject { if mods.contains(.option) { parts.append("⌥") } if mods.contains(.shift) { parts.append("⇧") } if mods.contains(.command) { parts.append("⌘") } - if let scalar = Unicode.Scalar(shortcutKeyCode) { - parts.append(String(Character(scalar)).uppercased()) - } + parts.append(Self.keyCodeToString(shortcutKeyCode)) return parts.joined() } + + private static func keyCodeToString(_ keyCode: UInt16) -> String { + let mapping: [UInt16: String] = [ + 0: "A", 1: "S", 2: "D", 3: "F", 4: "H", 5: "G", 6: "Z", 7: "X", + 8: "C", 9: "V", 11: "B", 12: "Q", 13: "W", 14: "E", 15: "R", + 16: "Y", 17: "T", 18: "1", 19: "2", 20: "3", 21: "4", 22: "6", + 23: "5", 24: "=", 25: "9", 26: "7", 27: "-", 28: "8", 29: "0", + 30: "]", 31: "O", 32: "U", 33: "[", 34: "I", 35: "P", 37: "L", + 38: "J", 39: "'", 40: "K", 41: ";", 42: "\\", 43: ",", 44: "/", + 45: "N", 46: "M", 47: ".", 49: "Space", 50: "`", + 36: "Return", 48: "Tab", 51: "Delete", 53: "Esc", + 96: "F5", 97: "F6", 98: "F7", 99: "F3", 100: "F8", + 101: "F9", 103: "F11", 105: "F13", 109: "F10", 111: "F12", + 118: "F4", 120: "F2", 122: "F1", + ] + return mapping[keyCode] ?? "Key\(keyCode)" + } } diff --git a/MyVoxtral/MyVoxtral/MyVoxtralApp.swift b/MyVoxtral/MyVoxtral/MyVoxtralApp.swift index 60fe270..81d9575 100644 --- a/MyVoxtral/MyVoxtral/MyVoxtralApp.swift +++ b/MyVoxtral/MyVoxtral/MyVoxtralApp.swift @@ -2,62 +2,89 @@ import SwiftUI @main struct MyVoxtralApp: App { - @StateObject private var manager = TranscriptionManager() - @StateObject private var settings = AppSettings.shared - @State private var transcriptionPanel: TranscriptionPanel? - @State private var settingsWindow: NSWindow? - - private let globalShortcut = GlobalShortcut() + @NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate var body: some Scene { MenuBarExtra { MenuBarView( - manager: manager, - onShowTranscription: { showTranscriptionWindow() }, - onShowSettings: { showSettingsWindow() } + manager: appDelegate.manager, + onShowTranscription: { appDelegate.showTranscriptionWindow() }, + onShowSettings: { appDelegate.showSettingsWindow() } ) - .task { - if !settings.hasAPIKey { - showSettingsWindow() - } - registerShortcut() - } - .onChange(of: manager.isRecording) { - if manager.isRecording && settings.outputMode == .textBox { - showTranscriptionWindow() + .onChange(of: appDelegate.manager.isRecording) { + if appDelegate.manager.isRecording && AppSettings.shared.outputMode == .textBox { + appDelegate.showTranscriptionWindow() } } } label: { - Image(systemName: manager.isRecording ? "mic.fill" : "mic") + Image(systemName: appDelegate.manager.isRecording ? "mic.fill" : "mic") .symbolRenderingMode(.palette) - .foregroundStyle(manager.isRecording ? .red : .primary) + .foregroundStyle(appDelegate.manager.isRecording ? .red : .primary) + } + } +} + +@MainActor +final class AppDelegate: NSObject, NSApplicationDelegate, ObservableObject { + let manager = TranscriptionManager() + private let globalShortcut = GlobalShortcut() + private var settingsWindow: NSWindow? + private var transcriptionPanel: TranscriptionPanel? + private var defaultsObserver: NSObjectProtocol? + private var lastKeyCode: UInt16 = 0 + private var lastModifiers: UInt = 0 + + func applicationDidFinishLaunching(_ notification: Notification) { + // Prompt for Accessibility permission (needed for global shortcut + cursor injection) + if !CursorInjector.isAccessibilityGranted { + CursorInjector.promptAccessibilityPermission() + } + + if !AppSettings.shared.hasAPIKey { + showSettingsWindow() + } + registerShortcut() + + // Re-register shortcut when settings change (only if shortcut actually changed) + defaultsObserver = NotificationCenter.default.addObserver( + forName: UserDefaults.didChangeNotification, object: nil, queue: .main + ) { [weak self] _ in + Task { @MainActor in + let settings = AppSettings.shared + guard let self, + settings.shortcutKeyCode != self.lastKeyCode || + settings.shortcutModifiers != self.lastModifiers else { return } + self.registerShortcut() + } } - .onChange(of: settings.shortcutKeyCode) { registerShortcut() } - .onChange(of: settings.shortcutModifiers) { registerShortcut() } } - private func registerShortcut() { + func registerShortcut() { + let settings = AppSettings.shared + lastKeyCode = settings.shortcutKeyCode + lastModifiers = settings.shortcutModifiers globalShortcut.register( keyCode: settings.shortcutKeyCode, modifiers: settings.shortcutModifiers ) - globalShortcut.onTrigger = { [weak manager] in + globalShortcut.onTrigger = { [weak self] in Task { @MainActor in - manager?.toggle() + self?.manager.toggle() } } } - private func showTranscriptionWindow() { + func showTranscriptionWindow() { if transcriptionPanel == nil { transcriptionPanel = TranscriptionPanel(manager: manager) } transcriptionPanel?.show() } - private func showSettingsWindow() { + func showSettingsWindow() { if let settingsWindow, settingsWindow.isVisible { settingsWindow.makeKeyAndOrderFront(nil) + NSApp.activate(ignoringOtherApps: true) return } let window = NSWindow( diff --git a/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift b/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift index 8283026..ec4d1b8 100644 --- a/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift +++ b/MyVoxtral/MyVoxtral/Network/VoxtralWebSocketClient.swift @@ -5,30 +5,36 @@ final class VoxtralWebSocketClient { private var webSocketTask: URLSessionWebSocketTask? private var session: URLSession? private let encoder = JSONEncoder() + private var intentionalDisconnect = false var onEvent: ((VoxtralEvent) -> Void)? - func connect(apiKey: String, delayMs: Int) { - guard let url = URL(string: "wss://api.mistral.ai/v1/audio/transcriptions/realtime") else { return } + func connect(apiKey: String, model: String = "voxtral-mini-transcribe-realtime-2602", delayMs: Int) { + var components = URLComponents(string: "wss://api.mistral.ai/v1/audio/transcriptions/realtime")! + components.queryItems = [URLQueryItem(name: "model", value: model)] + guard let url = components.url else { return } var request = URLRequest(url: url) request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization") - session = URLSession(configuration: .default) + let config = URLSessionConfiguration.default + config.httpAdditionalHeaders = ["Authorization": "Bearer \(apiKey)"] + session = URLSession(configuration: config) webSocketTask = session?.webSocketTask(with: request) webSocketTask?.resume() - // Send session config - let config = SessionUpdateMessage( + intentionalDisconnect = false + // Start receiving first, then send session config + receiveLoop() + + // Send session config after connection + let sessionConfig = SessionUpdateMessage( session: SessionConfig( audioFormat: AudioFormatConfig(), targetStreamingDelayMs: delayMs ) ) - sendJSON(config) - - // Start receiving - receiveLoop() + sendJSON(sessionConfig) } func sendAudio(_ pcmData: Data) { @@ -42,6 +48,7 @@ final class VoxtralWebSocketClient { } func disconnect() { + intentionalDisconnect = true sendJSON(AudioEndMessage()) webSocketTask?.cancel(with: .normalClosure, reason: nil) webSocketTask = nil @@ -79,10 +86,11 @@ final class VoxtralWebSocketClient { @unknown default: break } - self?.receiveLoop() + Task { @MainActor in self?.receiveLoop() } case .failure(let error): Task { @MainActor in - self?.onEvent?(.error("Connection lost: \(error.localizedDescription)")) + guard let self, !self.intentionalDisconnect else { return } + self.onEvent?(.error("Connection lost: \(error.localizedDescription)")) } } } diff --git a/MyVoxtral/MyVoxtral/Utilities/GlobalShortcut.swift b/MyVoxtral/MyVoxtral/Utilities/GlobalShortcut.swift index 1edf79b..8be521c 100644 --- a/MyVoxtral/MyVoxtral/Utilities/GlobalShortcut.swift +++ b/MyVoxtral/MyVoxtral/Utilities/GlobalShortcut.swift @@ -1,7 +1,8 @@ import Cocoa final class GlobalShortcut { - private var monitor: Any? + private var globalMonitor: Any? + private var localMonitor: Any? var onTrigger: (() -> Void)? func register(keyCode: UInt16, modifiers: UInt) { @@ -9,20 +10,31 @@ final class GlobalShortcut { guard keyCode != 0 || modifiers != 0 else { return } let requiredFlags = NSEvent.ModifierFlags(rawValue: modifiers) + let mask: NSEvent.ModifierFlags = [.command, .option, .control, .shift] - monitor = NSEvent.addGlobalMonitorForEvents(matching: .keyDown) { [weak self] event in - let mask: NSEvent.ModifierFlags = [.command, .option, .control, .shift] + globalMonitor = NSEvent.addGlobalMonitorForEvents(matching: .keyDown) { [weak self] event in if event.keyCode == keyCode && event.modifierFlags.intersection(mask) == requiredFlags { self?.onTrigger?() } } + localMonitor = NSEvent.addLocalMonitorForEvents(matching: .keyDown) { [weak self] event in + if event.keyCode == keyCode && event.modifierFlags.intersection(mask) == requiredFlags { + self?.onTrigger?() + return nil + } + return event + } } func unregister() { - if let monitor { - NSEvent.removeMonitor(monitor) + if let globalMonitor { + NSEvent.removeMonitor(globalMonitor) } - monitor = nil + globalMonitor = nil + if let localMonitor { + NSEvent.removeMonitor(localMonitor) + } + localMonitor = nil } deinit { diff --git a/MyVoxtral/MyVoxtral/Views/MenuBarView.swift b/MyVoxtral/MyVoxtral/Views/MenuBarView.swift index 0943128..0ce2ba0 100644 --- a/MyVoxtral/MyVoxtral/Views/MenuBarView.swift +++ b/MyVoxtral/MyVoxtral/Views/MenuBarView.swift @@ -11,7 +11,6 @@ struct MenuBarView: View { Button(manager.isRecording ? "Stop Recording" : "Start Recording") { manager.toggle() } - .keyboardShortcut("r") if case .error(let msg) = manager.state { Text(msg) diff --git a/MyVoxtral/MyVoxtral/Views/SettingsView.swift b/MyVoxtral/MyVoxtral/Views/SettingsView.swift index 94c0e41..49f8e60 100644 --- a/MyVoxtral/MyVoxtral/Views/SettingsView.swift +++ b/MyVoxtral/MyVoxtral/Views/SettingsView.swift @@ -3,6 +3,7 @@ import SwiftUI struct SettingsView: View { @ObservedObject var settings = AppSettings.shared @State private var isRecordingShortcut = false + @State private var keyMonitor: Any? var body: some View { Form { @@ -36,8 +37,18 @@ struct SettingsView: View { HStack { Text("Toggle Recording:") Spacer() - Button(isRecordingShortcut ? "Press keys..." : settings.shortcutDisplayString) { - isRecordingShortcut = true + Button(isRecordingShortcut ? "Press a key combo..." : settings.shortcutDisplayString) { + startRecordingShortcut() + } + .buttonStyle(.bordered) + + if settings.hasShortcut { + Button("Clear") { + settings.shortcutKeyCode = 0 + settings.shortcutModifiers = 0 + } + .buttonStyle(.borderless) + .foregroundStyle(.secondary) } } } @@ -64,5 +75,38 @@ struct SettingsView: View { } .formStyle(.grouped) .frame(width: 360, height: 340) + .onDisappear { + stopRecordingShortcut() + } + } + + private func startRecordingShortcut() { + isRecordingShortcut = true + keyMonitor = NSEvent.addLocalMonitorForEvents(matching: .keyDown) { event in + let mask: NSEvent.ModifierFlags = [.command, .option, .control, .shift] + let mods = event.modifierFlags.intersection(mask) + + // Require at least one modifier key + guard !mods.isEmpty else { + // Escape cancels + if event.keyCode == 53 { + stopRecordingShortcut() + } + return nil + } + + settings.shortcutKeyCode = event.keyCode + settings.shortcutModifiers = mods.rawValue + stopRecordingShortcut() + return nil // swallow the event + } + } + + private func stopRecordingShortcut() { + isRecordingShortcut = false + if let keyMonitor { + NSEvent.removeMonitor(keyMonitor) + } + keyMonitor = nil } } diff --git a/MyVoxtral/build-app.sh b/MyVoxtral/build-app.sh new file mode 100755 index 0000000..4f96265 --- /dev/null +++ b/MyVoxtral/build-app.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +APP_NAME="MyVoxtral" +BUILD_DIR=".build/arm64-apple-macosx/debug" +APP_BUNDLE="$BUILD_DIR/$APP_NAME.app" + +# Build +swift build + +# Create .app bundle structure +rm -rf "$APP_BUNDLE" +mkdir -p "$APP_BUNDLE/Contents/MacOS" +mkdir -p "$APP_BUNDLE/Contents/Resources" + +# Copy binary +cp "$BUILD_DIR/$APP_NAME" "$APP_BUNDLE/Contents/MacOS/$APP_NAME" + +# Copy Info.plist +cp "$APP_NAME/Info.plist" "$APP_BUNDLE/Contents/Info.plist" + +# Add CFBundleExecutable to Info.plist +/usr/libexec/PlistBuddy -c "Add :CFBundleExecutable string $APP_NAME" "$APP_BUNDLE/Contents/Info.plist" 2>/dev/null || true +/usr/libexec/PlistBuddy -c "Add :CFBundleIdentifier string com.myvoxtral.app" "$APP_BUNDLE/Contents/Info.plist" 2>/dev/null || true +/usr/libexec/PlistBuddy -c "Add :CFBundleName string $APP_NAME" "$APP_BUNDLE/Contents/Info.plist" 2>/dev/null || true +/usr/libexec/PlistBuddy -c "Add :CFBundlePackageType string APPL" "$APP_BUNDLE/Contents/Info.plist" 2>/dev/null || true + +echo "Built: $APP_BUNDLE" +echo "Run with: open $APP_BUNDLE"