summaryrefslogtreecommitdiff
path: root/Omni/Agent/Telegram/Media.hs
diff options
context:
space:
mode:
authorBen Sima <ben@bensima.com>2025-12-17 13:29:40 -0500
committerBen Sima <ben@bensima.com>2025-12-17 13:29:40 -0500
commitab01b34bf563990e0f491ada646472aaade97610 (patch)
tree5e46a1a157bb846b0c3a090a83153c788da2b977 /Omni/Agent/Telegram/Media.hs
parente112d3ce07fa24f31a281e521a554cc881a76c7b (diff)
parent337648981cc5a55935116141341521f4fce83214 (diff)
Merge Ava deployment changes
Diffstat (limited to 'Omni/Agent/Telegram/Media.hs')
-rw-r--r--Omni/Agent/Telegram/Media.hs327
1 files changed, 327 insertions, 0 deletions
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
new file mode 100644
index 0000000..47fbf91
--- /dev/null
+++ b/Omni/Agent/Telegram/Media.hs
@@ -0,0 +1,327 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Telegram Media Handling - File downloads, image analysis, voice transcription.
+--
+-- : out omni-agent-telegram-media
+-- : dep aeson
+-- : dep http-conduit
+-- : dep base64-bytestring
+module Omni.Agent.Telegram.Media
+ ( -- * File Downloads
+ getFile,
+ downloadFile,
+ downloadFileBytes,
+ downloadPhoto,
+ downloadVoice,
+ downloadAndExtractPdf,
+
+ -- * Multimodal Processing
+ analyzeImage,
+ transcribeVoice,
+
+ -- * Size Limits
+ maxImageBytes,
+ maxVoiceBytes,
+ checkPhotoSize,
+ checkVoiceSize,
+
+ -- * HTTP Utilities
+ httpGetBytes,
+ httpPostJson,
+
+ -- * Testing
+ main,
+ test,
+ )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Base64.Lazy as B64
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.CaseInsensitive as CI
+import qualified Data.Text as Text
+import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TLE
+import qualified Network.HTTP.Client as HTTPClient
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Telegram.Types as Types
+import qualified Omni.Agent.Tools.Pdf as Pdf
+import qualified Omni.Test as Test
+import System.Environment (lookupEnv)
+import System.IO (hClose)
+import System.IO.Temp (withSystemTempFile)
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+ Test.group
+ "Omni.Agent.Telegram.Media"
+ [ Test.unit "maxImageBytes is 10MB" <| do
+ maxImageBytes Test.@=? 10_000_000,
+ Test.unit "maxVoiceBytes is 20MB" <| do
+ maxVoiceBytes Test.@=? 20_000_000,
+ Test.unit "checkPhotoSize accepts small photos" <| do
+ let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000)
+ checkPhotoSize photo Test.@=? Right (),
+ Test.unit "checkPhotoSize rejects large photos" <| do
+ let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000)
+ case checkPhotoSize photo of
+ Left _ -> pure ()
+ Right _ -> Test.assertFailure "Expected rejection",
+ Test.unit "checkVoiceSize accepts small voice" <| do
+ let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000)
+ checkVoiceSize voice Test.@=? Right (),
+ Test.unit "checkVoiceSize rejects large voice" <| do
+ let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000)
+ case checkVoiceSize voice of
+ Left _ -> pure ()
+ Right _ -> Test.assertFailure "Expected rejection"
+ ]
+
+maxImageBytes :: Int
+maxImageBytes = 10_000_000
+
+maxVoiceBytes :: Int
+maxVoiceBytes = 20_000_000
+
+checkPhotoSize :: Types.TelegramPhoto -> Either Text ()
+checkPhotoSize photo =
+ case Types.tpFileSize photo of
+ Just size
+ | size > maxImageBytes ->
+ Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB"
+ _ -> Right ()
+
+checkVoiceSize :: Types.TelegramVoice -> Either Text ()
+checkVoiceSize voice =
+ case Types.tvFileSize voice of
+ Just size
+ | size > maxVoiceBytes ->
+ Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB"
+ _ -> Right ()
+
+httpGetBytes :: String -> IO (Either Text BL.ByteString)
+httpGetBytes url = do
+ result <-
+ try <| do
+ req <- HTTP.parseRequest url
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
+ Right r -> pure r
+
+httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString)
+httpPostJson url extraHeaders body timeoutSecs = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest url
+ let baseReq =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Content-Type" ["application/json"]
+ <| HTTP.setRequestBodyLBS (Aeson.encode body)
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000))
+ <| req0
+ req = foldr addHeader baseReq extraHeaders
+ addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
+ Right r -> pure r
+ where
+ shortBody r =
+ let b = BL.toStrict (HTTP.getResponseBody r)
+ in TE.decodeUtf8 (BS.take 200 b)
+
+getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text)
+getFile cfg fileId = do
+ let url =
+ Text.unpack (Types.tgApiBaseUrl cfg)
+ <> "/bot"
+ <> Text.unpack (Types.tgBotToken cfg)
+ <> "/getFile?file_id="
+ <> Text.unpack fileId
+ result <- httpGetBytes url
+ case result of
+ Left err -> pure (Left err)
+ Right body ->
+ case Aeson.decode body of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of
+ Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of
+ Just (Aeson.String path) -> pure (Right path)
+ _ -> pure (Left "No file_path in response")
+ _ -> pure (Left "No result in response")
+ _ -> pure (Left "Failed to parse getFile response")
+
+downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString)
+downloadFileBytes cfg filePath = do
+ let url =
+ "https://api.telegram.org/file/bot"
+ <> Text.unpack (Types.tgBotToken cfg)
+ <> "/"
+ <> Text.unpack filePath
+ httpGetBytes url
+
+downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ())
+downloadFile cfg filePath destPath = do
+ result <- downloadFileBytes cfg filePath
+ case result of
+ Left err -> pure (Left err)
+ Right bytes -> do
+ BL.writeFile destPath bytes
+ pure (Right ())
+
+downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString)
+downloadPhoto cfg photo = do
+ filePathResult <- getFile cfg (Types.tpFileId photo)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString)
+downloadVoice cfg voice = do
+ filePathResult <- getFile cfg (Types.tvFileId voice)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text)
+downloadAndExtractPdf cfg fileId = do
+ filePathResult <- getFile cfg fileId
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath ->
+ withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
+ hClose tmpHandle
+ downloadResult <- downloadFile cfg filePath tmpPath
+ case downloadResult of
+ Left err -> pure (Left err)
+ Right () -> Pdf.extractPdfText tmpPath
+
+parseOpenRouterResponse :: BL.ByteString -> Either Text Text
+parseOpenRouterResponse body =
+ case Aeson.decode body of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
+ Just (Aeson.Array choices) | not (null choices) ->
+ case toList choices of
+ (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
+ Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
+ Just (Aeson.String content) -> Right content
+ Just Aeson.Null -> Left "No content in response"
+ _ -> Left "Unexpected content type in response"
+ _ -> Left "No message in choice"
+ _ -> Left "Empty choices array"
+ _ -> Left "No choices in response"
+ _ -> Left "Failed to parse response"
+
+analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text)
+analyzeImage apiKey imageBytes userPrompt = do
+ let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes))
+ dataUrl = "data:image/jpeg;base64," <> base64Data
+ prompt =
+ if Text.null userPrompt
+ then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown."
+ else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)"
+ body =
+ Aeson.object
+ [ "model" .= ("anthropic/claude-sonnet-4.5" :: Text),
+ "messages"
+ .= [ Aeson.object
+ [ "role" .= ("user" :: Text),
+ "content"
+ .= [ Aeson.object
+ [ "type" .= ("text" :: Text),
+ "text" .= prompt
+ ],
+ Aeson.object
+ [ "type" .= ("image_url" :: Text),
+ "image_url"
+ .= Aeson.object
+ [ "url" .= dataUrl
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ headers =
+ [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
+ ("HTTP-Referer", "https://omni.dev"),
+ ("X-Title", "Omni Agent")
+ ]
+ result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+ case result of
+ Left err -> pure (Left ("Vision API error: " <> err))
+ Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
+
+transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeVoice _unusedApiKey audioBytes = do
+ maybeKey <- lookupEnv "OPENAI_API_KEY"
+ case maybeKey of
+ Nothing -> pure (Left "OPENAI_API_KEY not set - required for voice transcription")
+ Just key -> transcribeWithWhisper (Text.pack key) audioBytes
+
+transcribeWithWhisper :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeWithWhisper apiKey audioBytes = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest "https://api.openai.com/v1/audio/transcriptions"
+ let boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
+ body = buildMultipartBody boundary audioBytes
+ req =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+ <| HTTP.setRequestHeader "Content-Type" ["multipart/form-data; boundary=" <> boundary]
+ <| HTTP.setRequestBodyLBS body
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+ <| req0
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status <> ": " <> TL.toStrict (TLE.decodeUtf8 (BL.take 500 (HTTP.getResponseBody resp)))))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("Whisper API error: " <> tshow e))
+ Right (Left err) -> pure (Left ("Whisper API error: " <> err))
+ Right (Right respBody) ->
+ case Aeson.decode respBody of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "text" obj of
+ Just (Aeson.String transcription) -> pure (Right transcription)
+ _ -> pure (Left "No 'text' field in Whisper response")
+ _ -> pure (Left "Failed to parse Whisper response")
+
+buildMultipartBody :: ByteString -> BL.ByteString -> BL.ByteString
+buildMultipartBody boundary audioBytes =
+ BL.concat
+ [ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"file\"; filename=\"audio.ogg\"\r\n",
+ "Content-Type: audio/ogg\r\n\r\n",
+ audioBytes,
+ "\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "\r\n",
+ "Content-Disposition: form-data; name=\"model\"\r\n\r\n",
+ "whisper-1\r\n",
+ "--",
+ BL.fromStrict boundary,
+ "--\r\n"
+ ]