diff options
Diffstat (limited to 'Omni/Agent/Telegram/Media.hs')
| -rw-r--r-- | Omni/Agent/Telegram/Media.hs | 306 |
1 files changed, 306 insertions, 0 deletions
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs new file mode 100644 index 0000000..1ef35de --- /dev/null +++ b/Omni/Agent/Telegram/Media.hs @@ -0,0 +1,306 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE NoImplicitPrelude #-} + +-- | Telegram Media Handling - File downloads, image analysis, voice transcription. +-- +-- : out omni-agent-telegram-media +-- : dep aeson +-- : dep http-conduit +-- : dep base64-bytestring +module Omni.Agent.Telegram.Media + ( -- * File Downloads + getFile, + downloadFile, + downloadFileBytes, + downloadPhoto, + downloadVoice, + downloadAndExtractPdf, + + -- * Multimodal Processing + analyzeImage, + transcribeVoice, + + -- * Size Limits + maxImageBytes, + maxVoiceBytes, + checkPhotoSize, + checkVoiceSize, + + -- * HTTP Utilities + httpGetBytes, + httpPostJson, + + -- * Testing + main, + test, + ) +where + +import Alpha +import Data.Aeson ((.=)) +import qualified Data.Aeson as Aeson +import qualified Data.Aeson.KeyMap as KeyMap +import qualified Data.ByteString as BS +import qualified Data.ByteString.Base64.Lazy as B64 +import qualified Data.ByteString.Lazy as BL +import qualified Data.CaseInsensitive as CI +import qualified Data.Text as Text +import qualified Data.Text.Encoding as TE +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TLE +import qualified Network.HTTP.Client as HTTPClient +import qualified Network.HTTP.Simple as HTTP +import qualified Omni.Agent.Telegram.Types as Types +import qualified Omni.Agent.Tools.Pdf as Pdf +import qualified Omni.Test as Test +import System.IO (hClose) +import System.IO.Temp (withSystemTempFile) + +main :: IO () +main = Test.run test + +test :: Test.Tree +test = + Test.group + "Omni.Agent.Telegram.Media" + [ Test.unit "maxImageBytes is 10MB" <| do + maxImageBytes Test.@=? 10_000_000, + Test.unit "maxVoiceBytes is 20MB" <| do + maxVoiceBytes Test.@=? 20_000_000, + Test.unit "checkPhotoSize accepts small photos" <| do + let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000) + checkPhotoSize photo Test.@=? Right (), + Test.unit "checkPhotoSize rejects large photos" <| do + let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000) + case checkPhotoSize photo of + Left _ -> pure () + Right _ -> Test.assertFailure "Expected rejection", + Test.unit "checkVoiceSize accepts small voice" <| do + let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000) + checkVoiceSize voice Test.@=? Right (), + Test.unit "checkVoiceSize rejects large voice" <| do + let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000) + case checkVoiceSize voice of + Left _ -> pure () + Right _ -> Test.assertFailure "Expected rejection" + ] + +maxImageBytes :: Int +maxImageBytes = 10_000_000 + +maxVoiceBytes :: Int +maxVoiceBytes = 20_000_000 + +checkPhotoSize :: Types.TelegramPhoto -> Either Text () +checkPhotoSize photo = + case Types.tpFileSize photo of + Just size + | size > maxImageBytes -> + Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB" + _ -> Right () + +checkVoiceSize :: Types.TelegramVoice -> Either Text () +checkVoiceSize voice = + case Types.tvFileSize voice of + Just size + | size > maxVoiceBytes -> + Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB" + _ -> Right () + +httpGetBytes :: String -> IO (Either Text BL.ByteString) +httpGetBytes url = do + result <- + try <| do + req <- HTTP.parseRequest url + resp <- HTTP.httpLBS req + let status = HTTP.getResponseStatusCode resp + if status >= 200 && status < 300 + then pure (Right (HTTP.getResponseBody resp)) + else pure (Left ("HTTP " <> tshow status)) + case result of + Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e)) + Right r -> pure r + +httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString) +httpPostJson url extraHeaders body timeoutSecs = do + result <- + try <| do + req0 <- HTTP.parseRequest url + let baseReq = + HTTP.setRequestMethod "POST" + <| HTTP.setRequestHeader "Content-Type" ["application/json"] + <| HTTP.setRequestBodyLBS (Aeson.encode body) + <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000)) + <| req0 + req = foldr addHeader baseReq extraHeaders + addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value + resp <- HTTP.httpLBS req + let status = HTTP.getResponseStatusCode resp + if status >= 200 && status < 300 + then pure (Right (HTTP.getResponseBody resp)) + else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp)) + case result of + Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e)) + Right r -> pure r + where + shortBody r = + let b = BL.toStrict (HTTP.getResponseBody r) + in TE.decodeUtf8 (BS.take 200 b) + +getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text) +getFile cfg fileId = do + let url = + Text.unpack (Types.tgApiBaseUrl cfg) + <> "/bot" + <> Text.unpack (Types.tgBotToken cfg) + <> "/getFile?file_id=" + <> Text.unpack fileId + result <- httpGetBytes url + case result of + Left err -> pure (Left err) + Right body -> + case Aeson.decode body of + Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of + Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of + Just (Aeson.String path) -> pure (Right path) + _ -> pure (Left "No file_path in response") + _ -> pure (Left "No result in response") + _ -> pure (Left "Failed to parse getFile response") + +downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString) +downloadFileBytes cfg filePath = do + let url = + "https://api.telegram.org/file/bot" + <> Text.unpack (Types.tgBotToken cfg) + <> "/" + <> Text.unpack filePath + httpGetBytes url + +downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ()) +downloadFile cfg filePath destPath = do + result <- downloadFileBytes cfg filePath + case result of + Left err -> pure (Left err) + Right bytes -> do + BL.writeFile destPath bytes + pure (Right ()) + +downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString) +downloadPhoto cfg photo = do + filePathResult <- getFile cfg (Types.tpFileId photo) + case filePathResult of + Left err -> pure (Left err) + Right filePath -> downloadFileBytes cfg filePath + +downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString) +downloadVoice cfg voice = do + filePathResult <- getFile cfg (Types.tvFileId voice) + case filePathResult of + Left err -> pure (Left err) + Right filePath -> downloadFileBytes cfg filePath + +downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text) +downloadAndExtractPdf cfg fileId = do + filePathResult <- getFile cfg fileId + case filePathResult of + Left err -> pure (Left err) + Right filePath -> + withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do + hClose tmpHandle + downloadResult <- downloadFile cfg filePath tmpPath + case downloadResult of + Left err -> pure (Left err) + Right () -> Pdf.extractPdfText tmpPath + +parseOpenRouterResponse :: BL.ByteString -> Either Text Text +parseOpenRouterResponse body = + case Aeson.decode body of + Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of + Just (Aeson.Array choices) | not (null choices) -> + case toList choices of + (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of + Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of + Just (Aeson.String content) -> Right content + Just Aeson.Null -> Left "No content in response" + _ -> Left "Unexpected content type in response" + _ -> Left "No message in choice" + _ -> Left "Empty choices array" + _ -> Left "No choices in response" + _ -> Left "Failed to parse response" + +analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text) +analyzeImage apiKey imageBytes userPrompt = do + let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes)) + dataUrl = "data:image/jpeg;base64," <> base64Data + prompt = + if Text.null userPrompt + then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown." + else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)" + body = + Aeson.object + [ "model" .= ("anthropic/claude-sonnet-4" :: Text), + "messages" + .= [ Aeson.object + [ "role" .= ("user" :: Text), + "content" + .= [ Aeson.object + [ "type" .= ("text" :: Text), + "text" .= prompt + ], + Aeson.object + [ "type" .= ("image_url" :: Text), + "image_url" + .= Aeson.object + [ "url" .= dataUrl + ] + ] + ] + ] + ] + ] + headers = + [ ("Authorization", "Bearer " <> encodeUtf8 apiKey), + ("HTTP-Referer", "https://omni.dev"), + ("X-Title", "Omni Agent") + ] + result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120 + case result of + Left err -> pure (Left ("Vision API error: " <> err)) + Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody)) + +transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text) +transcribeVoice apiKey audioBytes = do + let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes)) + body = + Aeson.object + [ "model" .= ("google/gemini-2.0-flash-001" :: Text), + "messages" + .= [ Aeson.object + [ "role" .= ("user" :: Text), + "content" + .= [ Aeson.object + [ "type" .= ("text" :: Text), + "text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text) + ], + Aeson.object + [ "type" .= ("input_audio" :: Text), + "input_audio" + .= Aeson.object + [ "data" .= base64Data, + "format" .= ("ogg" :: Text) + ] + ] + ] + ] + ] + ] + headers = + [ ("Authorization", "Bearer " <> encodeUtf8 apiKey), + ("HTTP-Referer", "https://omni.dev"), + ("X-Title", "Omni Agent") + ] + result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120 + case result of + Left err -> pure (Left ("Transcription API error: " <> err)) + Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody)) |
