| author | Ben Sima <ben@bensima.com> | 2025-12-13 14:46:33 -0500 |
|---|---|---|
| committer | Ben Sima <ben@bensima.com> | 2025-12-13 14:46:33 -0500 |
| commit | 38c4ea7fcb86ea78448e7097fcd8689d37d78399 | |
| tree | 96137251440db989a5a11a06739f4dc67fa6f29c /Omni/Agent | |
| parent | 61ebcf0aeea6cfbdb70becf47bad38d001d8faa3 | |
fix: use OpenAI Whisper for voice transcription
OpenRouter's chat completion API doesn't properly pass audio to models.
Switched to calling OpenAI's /v1/audio/transcriptions endpoint directly
with the whisper-1 model.
Requires OPENAI_API_KEY environment variable.
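For reference, the request the patched code makes is a plain multipart/form-data POST to https://api.openai.com/v1/audio/transcriptions with a `file` part (the OGG voice note) and a `model` part set to `whisper-1`, authenticated with the key from OPENAI_API_KEY; Whisper responds with JSON containing a `text` field. Below is a minimal standalone sketch of that same call. It assumes http-client's `Network.HTTP.Client.MultipartFormData` helpers are available and uses a placeholder input file `voice.ogg`; the patch itself builds the multipart body by hand (see `buildMultipartBody` in the diff), but the request shape is the same.

```haskell
{-# LANGUAGE OverloadedStrings #-}

module Main (main) where

import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy as BL
import Network.HTTP.Client (RequestBody (RequestBodyLBS))
import Network.HTTP.Client.MultipartFormData (formDataBody, partBS, partFileRequestBody)
import Network.HTTP.Simple
import System.Environment (getEnv)

main :: IO ()
main = do
  -- Same environment variable the patch requires.
  apiKey <- getEnv "OPENAI_API_KEY"
  -- Telegram voice notes arrive as OGG; "voice.ogg" is a placeholder input.
  audio <- BL.readFile "voice.ogg"
  req0 <- parseRequest "POST https://api.openai.com/v1/audio/transcriptions"
  req <-
    formDataBody
      [ partFileRequestBody "file" "audio.ogg" (RequestBodyLBS audio),
        partBS "model" "whisper-1"
      ]
      (setRequestHeader "Authorization" ["Bearer " <> BS.pack apiKey] req0)
  resp <- httpLBS req
  -- On success Whisper returns JSON like {"text": "..."}; the patched
  -- transcribeWithWhisper extracts that "text" field with aeson.
  BL.putStr (getResponseBody resp)
```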
Diffstat (limited to 'Omni/Agent')
| -rw-r--r-- | Omni/Agent/Telegram.hs | 2 |
| -rw-r--r-- | Omni/Agent/Telegram/Media.hs | 87 |
2 files changed, 54 insertions, 35 deletions
diff --git a/Omni/Agent/Telegram.hs b/Omni/Agent/Telegram.hs
index ad2fc3b..61127b4 100644
--- a/Omni/Agent/Telegram.hs
+++ b/Omni/Agent/Telegram.hs
@@ -603,7 +603,6 @@ handleAuthorizedMessage tgConfig provider engineCfg msg uid userName chatId = do
         _ -> Types.tmText msg
 
   let userMessage = replyContext <> baseMessage
-
   shouldEngage <-
     if Types.isGroupChat msg
       then do
@@ -728,7 +727,6 @@ handleAuthorizedMessageBatch tgConfig provider engineCfg msg uid userName chatId
         _ -> ""
 
   let userMessage = mediaPrefix <> batchedText
-
   shouldEngage <-
     if Types.isGroupChat msg
       then do
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
index 6539b79..47fbf91 100644
--- a/Omni/Agent/Telegram/Media.hs
+++ b/Omni/Agent/Telegram/Media.hs
@@ -54,6 +54,7 @@ import qualified Network.HTTP.Simple as HTTP
 import qualified Omni.Agent.Telegram.Types as Types
 import qualified Omni.Agent.Tools.Pdf as Pdf
 import qualified Omni.Test as Test
+import System.Environment (lookupEnv)
 import System.IO (hClose)
 import System.IO.Temp (withSystemTempFile)
 
@@ -270,37 +271,57 @@ analyzeImage apiKey imageBytes userPrompt = do
     Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
 
 transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
-transcribeVoice apiKey audioBytes = do
-  let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
-      body =
-        Aeson.object
-          [ "model" .= ("google/gemini-2.5-flash" :: Text),
-            "messages"
-              .= [ Aeson.object
-                     [ "role" .= ("user" :: Text),
-                       "content"
-                         .= [ Aeson.object
-                                [ "type" .= ("input_audio" :: Text),
-                                  "input_audio"
-                                    .= Aeson.object
-                                      [ "data" .= base64Data,
-                                        "format" .= ("ogg" :: Text)
-                                      ]
-                                ],
-                              Aeson.object
-                                [ "type" .= ("text" :: Text),
-                                  "text" .= ("transcribe this audio exactly. return ONLY the transcription, no commentary or preamble." :: Text)
-                                ]
-                            ]
-                     ]
-                 ]
-          ]
-      headers =
-        [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
-          ("HTTP-Referer", "https://omni.dev"),
-          ("X-Title", "Omni Agent")
-        ]
-  result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+transcribeVoice _unusedApiKey audioBytes = do
+  maybeKey <- lookupEnv "OPENAI_API_KEY"
+  case maybeKey of
+    Nothing -> pure (Left "OPENAI_API_KEY not set - required for voice transcription")
+    Just key -> transcribeWithWhisper (Text.pack key) audioBytes
+
+transcribeWithWhisper :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeWithWhisper apiKey audioBytes = do
+  result <-
+    try <| do
+      req0 <- HTTP.parseRequest "https://api.openai.com/v1/audio/transcriptions"
+      let boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
+          body = buildMultipartBody boundary audioBytes
+          req =
+            HTTP.setRequestMethod "POST"
+              <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+              <| HTTP.setRequestHeader "Content-Type" ["multipart/form-data; boundary=" <> boundary]
+              <| HTTP.setRequestBodyLBS body
+              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+              <| req0
+      resp <- HTTP.httpLBS req
+      let status = HTTP.getResponseStatusCode resp
+      if status >= 200 && status < 300
+        then pure (Right (HTTP.getResponseBody resp))
+        else pure (Left ("HTTP " <> tshow status <> ": " <> TL.toStrict (TLE.decodeUtf8 (BL.take 500 (HTTP.getResponseBody resp)))))
   case result of
-    Left err -> pure (Left ("Transcription API error: " <> err))
-    Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))
+    Left (e :: SomeException) -> pure (Left ("Whisper API error: " <> tshow e))
+    Right (Left err) -> pure (Left ("Whisper API error: " <> err))
+    Right (Right respBody) ->
+      case Aeson.decode respBody of
+        Just (Aeson.Object obj) -> case KeyMap.lookup "text" obj of
+          Just (Aeson.String transcription) -> pure (Right transcription)
+          _ -> pure (Left "No 'text' field in Whisper response")
+        _ -> pure (Left "Failed to parse Whisper response")
+
+buildMultipartBody :: ByteString -> BL.ByteString -> BL.ByteString
+buildMultipartBody boundary audioBytes =
+  BL.concat
+    [ "--",
+      BL.fromStrict boundary,
+      "\r\n",
+      "Content-Disposition: form-data; name=\"file\"; filename=\"audio.ogg\"\r\n",
+      "Content-Type: audio/ogg\r\n\r\n",
+      audioBytes,
+      "\r\n",
+      "--",
+      BL.fromStrict boundary,
+      "\r\n",
+      "Content-Disposition: form-data; name=\"model\"\r\n\r\n",
+      "whisper-1\r\n",
+      "--",
+      BL.fromStrict boundary,
+      "--\r\n"
+    ]
