{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE NoImplicitPrelude #-} -- | Telegram Media Handling - File downloads, image analysis, voice transcription. -- -- : out omni-agent-telegram-media -- : dep aeson -- : dep http-conduit -- : dep base64-bytestring module Omni.Agent.Telegram.Media ( -- * File Downloads getFile, downloadFile, downloadFileBytes, downloadPhoto, downloadVoice, downloadAndExtractPdf, -- * Multimodal Processing analyzeImage, transcribeVoice, -- * Size Limits maxImageBytes, maxVoiceBytes, checkPhotoSize, checkVoiceSize, -- * HTTP Utilities httpGetBytes, httpPostJson, -- * Testing main, test, ) where import Alpha import Data.Aeson ((.=)) import qualified Data.Aeson as Aeson import qualified Data.Aeson.KeyMap as KeyMap import qualified Data.ByteString as BS import qualified Data.ByteString.Base64.Lazy as B64 import qualified Data.ByteString.Lazy as BL import qualified Data.CaseInsensitive as CI import qualified Data.Text as Text import qualified Data.Text.Encoding as TE import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TLE import qualified Network.HTTP.Client as HTTPClient import qualified Network.HTTP.Simple as HTTP import qualified Omni.Agent.Telegram.Types as Types import qualified Omni.Agent.Tools.Pdf as Pdf import qualified Omni.Test as Test import System.Environment (lookupEnv) import System.IO (hClose) import System.IO.Temp (withSystemTempFile) main :: IO () main = Test.run test test :: Test.Tree test = Test.group "Omni.Agent.Telegram.Media" [ Test.unit "maxImageBytes is 10MB" <| do maxImageBytes Test.@=? 10_000_000, Test.unit "maxVoiceBytes is 20MB" <| do maxVoiceBytes Test.@=? 20_000_000, Test.unit "checkPhotoSize accepts small photos" <| do let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000) checkPhotoSize photo Test.@=? Right (), Test.unit "checkPhotoSize rejects large photos" <| do let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000) case checkPhotoSize photo of Left _ -> pure () Right _ -> Test.assertFailure "Expected rejection", Test.unit "checkVoiceSize accepts small voice" <| do let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000) checkVoiceSize voice Test.@=? Right (), Test.unit "checkVoiceSize rejects large voice" <| do let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000) case checkVoiceSize voice of Left _ -> pure () Right _ -> Test.assertFailure "Expected rejection" ] maxImageBytes :: Int maxImageBytes = 10_000_000 maxVoiceBytes :: Int maxVoiceBytes = 20_000_000 checkPhotoSize :: Types.TelegramPhoto -> Either Text () checkPhotoSize photo = case Types.tpFileSize photo of Just size | size > maxImageBytes -> Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB" _ -> Right () checkVoiceSize :: Types.TelegramVoice -> Either Text () checkVoiceSize voice = case Types.tvFileSize voice of Just size | size > maxVoiceBytes -> Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB" _ -> Right () httpGetBytes :: String -> IO (Either Text BL.ByteString) httpGetBytes url = do result <- try <| do req <- HTTP.parseRequest url resp <- HTTP.httpLBS req let status = HTTP.getResponseStatusCode resp if status >= 200 && status < 300 then pure (Right (HTTP.getResponseBody resp)) else pure (Left ("HTTP " <> tshow status)) case result of Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e)) Right r -> pure r httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString) httpPostJson url extraHeaders body timeoutSecs = do result <- try <| do req0 <- HTTP.parseRequest url let baseReq = HTTP.setRequestMethod "POST" <| HTTP.setRequestHeader "Content-Type" ["application/json"] <| HTTP.setRequestBodyLBS (Aeson.encode body) <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000)) <| req0 req = foldr addHeader baseReq extraHeaders addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value resp <- HTTP.httpLBS req let status = HTTP.getResponseStatusCode resp if status >= 200 && status < 300 then pure (Right (HTTP.getResponseBody resp)) else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp)) case result of Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e)) Right r -> pure r where shortBody r = let b = BL.toStrict (HTTP.getResponseBody r) in TE.decodeUtf8 (BS.take 200 b) getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text) getFile cfg fileId = do let url = Text.unpack (Types.tgApiBaseUrl cfg) <> "/bot" <> Text.unpack (Types.tgBotToken cfg) <> "/getFile?file_id=" <> Text.unpack fileId result <- httpGetBytes url case result of Left err -> pure (Left err) Right body -> case Aeson.decode body of Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of Just (Aeson.String path) -> pure (Right path) _ -> pure (Left "No file_path in response") _ -> pure (Left "No result in response") _ -> pure (Left "Failed to parse getFile response") downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString) downloadFileBytes cfg filePath = do let url = "https://api.telegram.org/file/bot" <> Text.unpack (Types.tgBotToken cfg) <> "/" <> Text.unpack filePath httpGetBytes url downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ()) downloadFile cfg filePath destPath = do result <- downloadFileBytes cfg filePath case result of Left err -> pure (Left err) Right bytes -> do BL.writeFile destPath bytes pure (Right ()) downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString) downloadPhoto cfg photo = do filePathResult <- getFile cfg (Types.tpFileId photo) case filePathResult of Left err -> pure (Left err) Right filePath -> downloadFileBytes cfg filePath downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString) downloadVoice cfg voice = do filePathResult <- getFile cfg (Types.tvFileId voice) case filePathResult of Left err -> pure (Left err) Right filePath -> downloadFileBytes cfg filePath downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text) downloadAndExtractPdf cfg fileId = do filePathResult <- getFile cfg fileId case filePathResult of Left err -> pure (Left err) Right filePath -> withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do hClose tmpHandle downloadResult <- downloadFile cfg filePath tmpPath case downloadResult of Left err -> pure (Left err) Right () -> Pdf.extractPdfText tmpPath parseOpenRouterResponse :: BL.ByteString -> Either Text Text parseOpenRouterResponse body = case Aeson.decode body of Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of Just (Aeson.Array choices) | not (null choices) -> case toList choices of (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of Just (Aeson.String content) -> Right content Just Aeson.Null -> Left "No content in response" _ -> Left "Unexpected content type in response" _ -> Left "No message in choice" _ -> Left "Empty choices array" _ -> Left "No choices in response" _ -> Left "Failed to parse response" analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text) analyzeImage apiKey imageBytes userPrompt = do let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes)) dataUrl = "data:image/jpeg;base64," <> base64Data prompt = if Text.null userPrompt then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown." else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)" body = Aeson.object [ "model" .= ("anthropic/claude-sonnet-4.5" :: Text), "messages" .= [ Aeson.object [ "role" .= ("user" :: Text), "content" .= [ Aeson.object [ "type" .= ("text" :: Text), "text" .= prompt ], Aeson.object [ "type" .= ("image_url" :: Text), "image_url" .= Aeson.object [ "url" .= dataUrl ] ] ] ] ] ] headers = [ ("Authorization", "Bearer " <> encodeUtf8 apiKey), ("HTTP-Referer", "https://omni.dev"), ("X-Title", "Omni Agent") ] result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120 case result of Left err -> pure (Left ("Vision API error: " <> err)) Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody)) transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text) transcribeVoice _unusedApiKey audioBytes = do maybeKey <- lookupEnv "OPENAI_API_KEY" case maybeKey of Nothing -> pure (Left "OPENAI_API_KEY not set - required for voice transcription") Just key -> transcribeWithWhisper (Text.pack key) audioBytes transcribeWithWhisper :: Text -> BL.ByteString -> IO (Either Text Text) transcribeWithWhisper apiKey audioBytes = do result <- try <| do req0 <- HTTP.parseRequest "https://api.openai.com/v1/audio/transcriptions" let boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW" body = buildMultipartBody boundary audioBytes req = HTTP.setRequestMethod "POST" <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey] <| HTTP.setRequestHeader "Content-Type" ["multipart/form-data; boundary=" <> boundary] <| HTTP.setRequestBodyLBS body <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000)) <| req0 resp <- HTTP.httpLBS req let status = HTTP.getResponseStatusCode resp if status >= 200 && status < 300 then pure (Right (HTTP.getResponseBody resp)) else pure (Left ("HTTP " <> tshow status <> ": " <> TL.toStrict (TLE.decodeUtf8 (BL.take 500 (HTTP.getResponseBody resp))))) case result of Left (e :: SomeException) -> pure (Left ("Whisper API error: " <> tshow e)) Right (Left err) -> pure (Left ("Whisper API error: " <> err)) Right (Right respBody) -> case Aeson.decode respBody of Just (Aeson.Object obj) -> case KeyMap.lookup "text" obj of Just (Aeson.String transcription) -> pure (Right transcription) _ -> pure (Left "No 'text' field in Whisper response") _ -> pure (Left "Failed to parse Whisper response") buildMultipartBody :: ByteString -> BL.ByteString -> BL.ByteString buildMultipartBody boundary audioBytes = BL.concat [ "--", BL.fromStrict boundary, "\r\n", "Content-Disposition: form-data; name=\"file\"; filename=\"audio.ogg\"\r\n", "Content-Type: audio/ogg\r\n\r\n", audioBytes, "\r\n", "--", BL.fromStrict boundary, "\r\n", "Content-Disposition: form-data; name=\"model\"\r\n\r\n", "whisper-1\r\n", "--", BL.fromStrict boundary, "--\r\n" ]