summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ben Sima <ben@bensima.com> 2025-12-13 14:46:33 -0500
committer: Ben Sima <ben@bensima.com> 2025-12-13 14:46:33 -0500
commit: 38c4ea7fcb86ea78448e7097fcd8689d37d78399 (patch)
tree: 96137251440db989a5a11a06739f4dc67fa6f29c
parent: 61ebcf0aeea6cfbdb70becf47bad38d001d8faa3 (diff)
fix: use OpenAI Whisper for voice transcription
OpenRouter's chat completion API doesn't properly pass audio to models. Switched to calling OpenAI's /v1/audio/transcriptions endpoint directly with the whisper-1 model. Requires OPENAI_API_KEY environment variable.
-rw-r--r--  Omni/Agent/Telegram.hs        2
-rw-r--r--  Omni/Agent/Telegram/Media.hs  87
2 files changed, 54 insertions, 35 deletions
diff --git a/Omni/Agent/Telegram.hs b/Omni/Agent/Telegram.hs
index ad2fc3b..61127b4 100644
--- a/Omni/Agent/Telegram.hs
+++ b/Omni/Agent/Telegram.hs
@@ -603,7 +603,6 @@ handleAuthorizedMessage tgConfig provider engineCfg msg uid userName chatId = do
_ -> Types.tmText msg
let userMessage = replyContext <> baseMessage
-
shouldEngage <-
if Types.isGroupChat msg
then do
@@ -728,7 +727,6 @@ handleAuthorizedMessageBatch tgConfig provider engineCfg msg uid userName chatId
_ -> ""
let userMessage = mediaPrefix <> batchedText
-
shouldEngage <-
if Types.isGroupChat msg
then do
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
index 6539b79..47fbf91 100644
--- a/Omni/Agent/Telegram/Media.hs
+++ b/Omni/Agent/Telegram/Media.hs
@@ -54,6 +54,7 @@ import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Telegram.Types as Types
import qualified Omni.Agent.Tools.Pdf as Pdf
import qualified Omni.Test as Test
+import System.Environment (lookupEnv)
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
@@ -270,37 +271,57 @@ analyzeImage apiKey imageBytes userPrompt = do
Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
-transcribeVoice apiKey audioBytes = do
- let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
- body =
- Aeson.object
- [ "model" .= ("google/gemini-2.5-flash" :: Text),
- "messages"
- .= [ Aeson.object
- [ "role" .= ("user" :: Text),
- "content"
- .= [ Aeson.object
- [ "type" .= ("input_audio" :: Text),
- "input_audio"
- .= Aeson.object
- [ "data" .= base64Data,
- "format" .= ("ogg" :: Text)
- ]
- ],
- Aeson.object
- [ "type" .= ("text" :: Text),
- "text" .= ("transcribe this audio exactly. return ONLY the transcription, no commentary or preamble." :: Text)
- ]
- ]
- ]
- ]
- ]
- headers =
- [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
- ("HTTP-Referer", "https://omni.dev"),
- ("X-Title", "Omni Agent")
- ]
- result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+transcribeVoice _unusedApiKey audioBytes = do
+ maybeKey <- lookupEnv "OPENAI_API_KEY"
+ case maybeKey of
+ Nothing -> pure (Left "OPENAI_API_KEY not set - required for voice transcription")
+ Just key -> transcribeWithWhisper (Text.pack key) audioBytes
+
+transcribeWithWhisper :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeWithWhisper apiKey audioBytes = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest "https://api.openai.com/v1/audio/transcriptions"
+ let boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
+ body = buildMultipartBody boundary audioBytes
+ req =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+ <| HTTP.setRequestHeader "Content-Type" ["multipart/form-data; boundary=" <> boundary]
+ <| HTTP.setRequestBodyLBS body
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+ <| req0
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status <> ": " <> TL.toStrict (TLE.decodeUtf8 (BL.take 500 (HTTP.getResponseBody resp)))))
case result of
- Left err -> pure (Left ("Transcription API error: " <> err))
- Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))
+ Left (e :: SomeException) -> pure (Left ("Whisper API error: " <> tshow e))
+ Right (Left err) -> pure (Left ("Whisper API error: " <> err))
+ Right (Right respBody) ->
+ case Aeson.decode respBody of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "text" obj of
+ Just (Aeson.String transcription) -> pure (Right transcription)
+ _ -> pure (Left "No 'text' field in Whisper response")
+ _ -> pure (Left "Failed to parse Whisper response")
+
+buildMultipartBody :: ByteString -> BL.ByteString -> BL.ByteString
+buildMultipartBody boundary audioBytes =
+ BL.concat
+ [ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"file\"; filename=\"audio.ogg\"\r\n",
+ "Content-Type: audio/ogg\r\n\r\n",
+ audioBytes,
+ "\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"model\"\r\n\r\n",
+ "whisper-1\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "--\r\n"
+ ]