summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ben Sima <ben@bensima.com> 2025-12-13 14:46:33 -0500
committer: Ben Sima <ben@bensima.com> 2025-12-13 14:46:33 -0500
commit: 38c4ea7fcb86ea78448e7097fcd8689d37d78399 (patch)
tree: 96137251440db989a5a11a06739f4dc67fa6f29c
parent: 61ebcf0aeea6cfbdb70becf47bad38d001d8faa3 (diff)
fix: use OpenAI Whisper for voice transcription
OpenRouter's chat completion API doesn't properly pass audio to models. Switched to calling OpenAI's /v1/audio/transcriptions endpoint directly with the whisper-1 model. Requires OPENAI_API_KEY environment variable.
-rw-r--r--  Omni/Agent/Telegram.hs        2
-rw-r--r--  Omni/Agent/Telegram/Media.hs  87
2 files changed, 54 insertions, 35 deletions
diff --git a/Omni/Agent/Telegram.hs b/Omni/Agent/Telegram.hs
index ad2fc3b..61127b4 100644
--- a/Omni/Agent/Telegram.hs
+++ b/Omni/Agent/Telegram.hs
@@ -603,7 +603,6 @@ handleAuthorizedMessage tgConfig provider engineCfg msg uid userName chatId = do
_ -> Types.tmText msg
let userMessage = replyContext <> baseMessage
-
shouldEngage <-
if Types.isGroupChat msg
then do
@@ -728,7 +727,6 @@ handleAuthorizedMessageBatch tgConfig provider engineCfg msg uid userName chatId
_ -> ""
let userMessage = mediaPrefix <> batchedText
-
shouldEngage <-
if Types.isGroupChat msg
then do
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
index 6539b79..47fbf91 100644
--- a/Omni/Agent/Telegram/Media.hs
+++ b/Omni/Agent/Telegram/Media.hs
@@ -54,6 +54,7 @@ import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Telegram.Types as Types
import qualified Omni.Agent.Tools.Pdf as Pdf
import qualified Omni.Test as Test
+import System.Environment (lookupEnv)
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
@@ -270,37 +271,57 @@ analyzeImage apiKey imageBytes userPrompt = do
Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
-transcribeVoice apiKey audioBytes = do
- let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
- body =
- Aeson.object
- [ "model" .= ("google/gemini-2.5-flash" :: Text),
- "messages"
- .= [ Aeson.object
- [ "role" .= ("user" :: Text),
- "content"
- .= [ Aeson.object
- [ "type" .= ("input_audio" :: Text),
- "input_audio"
- .= Aeson.object
- [ "data" .= base64Data,
- "format" .= ("ogg" :: Text)
- ]
- ],
- Aeson.object
- [ "type" .= ("text" :: Text),
- "text" .= ("transcribe this audio exactly. return ONLY the transcription, no commentary or preamble." :: Text)
- ]
- ]
- ]
- ]
- ]
- headers =
- [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
- ("HTTP-Referer", "https://omni.dev"),
- ("X-Title", "Omni Agent")
- ]
- result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+transcribeVoice _unusedApiKey audioBytes = do
+ maybeKey <- lookupEnv "OPENAI_API_KEY"
+ case maybeKey of
+ Nothing -> pure (Left "OPENAI_API_KEY not set - required for voice transcription")
+ Just key -> transcribeWithWhisper (Text.pack key) audioBytes
+
+transcribeWithWhisper :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeWithWhisper apiKey audioBytes = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest "https://api.openai.com/v1/audio/transcriptions"
+ let boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
+ body = buildMultipartBody boundary audioBytes
+ req =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+ <| HTTP.setRequestHeader "Content-Type" ["multipart/form-data; boundary=" <> boundary]
+ <| HTTP.setRequestBodyLBS body
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+ <| req0
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status <> ": " <> TL.toStrict (TLE.decodeUtf8 (BL.take 500 (HTTP.getResponseBody resp)))))
case result of
- Left err -> pure (Left ("Transcription API error: " <> err))
- Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))
+ Left (e :: SomeException) -> pure (Left ("Whisper API error: " <> tshow e))
+ Right (Left err) -> pure (Left ("Whisper API error: " <> err))
+ Right (Right respBody) ->
+ case Aeson.decode respBody of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "text" obj of
+ Just (Aeson.String transcription) -> pure (Right transcription)
+ _ -> pure (Left "No 'text' field in Whisper response")
+ _ -> pure (Left "Failed to parse Whisper response")
+
+buildMultipartBody :: ByteString -> BL.ByteString -> BL.ByteString
+buildMultipartBody boundary audioBytes =
+ BL.concat
+ [ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"file\"; filename=\"audio.ogg\"\r\n",
+ "Content-Type: audio/ogg\r\n\r\n",
+ audioBytes,
+ "\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"model\"\r\n\r\n",
+ "whisper-1\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "--\r\n"
+ ]