summaryrefslogtreecommitdiff
path: root/Omni/Agent/Telegram/Media.hs
diff options
context:
space:
mode:
authorBen Sima <ben@bensima.com>2025-12-12 23:30:04 -0500
committerBen Sima <ben@bensima.com>2025-12-12 23:30:04 -0500
commit817bdb1f33e9825946a2da2aa1ff8f91b6166366 (patch)
tree32af363a03de72964e999ce437a7e01bfc80a85a /Omni/Agent/Telegram/Media.hs
parentbfa50a5a755e13c0ee2394d89280092a639d8f0d (diff)
telegram bot: refactor + multimedia + reply support
Refactor Telegram.hs into submodules to reduce file size: - Types.hs: data types, JSON parsing - Media.hs: file downloads, image/voice analysis - Reminders.hs: reminder loop, user chat persistence Multimedia improvements: - Vision uses third-person to avoid LLM confusion - Better message framing for embedded descriptions - Size validation (10MB images, 20MB voice) - MIME type validation for voice messages New features: - Reply support: bot sees context when users reply - Web search: default 5->10, max 10->20 results - Guardrails: duplicate tool limit 3->10 for research - Timezone: todos parse/display in Eastern time (ET)
Diffstat (limited to 'Omni/Agent/Telegram/Media.hs')
-rw-r--r--Omni/Agent/Telegram/Media.hs306
1 files changed, 306 insertions, 0 deletions
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
new file mode 100644
index 0000000..1ef35de
--- /dev/null
+++ b/Omni/Agent/Telegram/Media.hs
@@ -0,0 +1,306 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Telegram Media Handling - File downloads, image analysis, voice transcription.
+--
+-- : out omni-agent-telegram-media
+-- : dep aeson
+-- : dep http-conduit
+-- : dep base64-bytestring
+module Omni.Agent.Telegram.Media
+ ( -- * File Downloads
+ getFile,
+ downloadFile,
+ downloadFileBytes,
+ downloadPhoto,
+ downloadVoice,
+ downloadAndExtractPdf,
+
+ -- * Multimodal Processing
+ analyzeImage,
+ transcribeVoice,
+
+ -- * Size Limits
+ maxImageBytes,
+ maxVoiceBytes,
+ checkPhotoSize,
+ checkVoiceSize,
+
+ -- * HTTP Utilities
+ httpGetBytes,
+ httpPostJson,
+
+ -- * Testing
+ main,
+ test,
+ )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Base64.Lazy as B64
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.CaseInsensitive as CI
+import qualified Data.Text as Text
+import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TLE
+import qualified Network.HTTP.Client as HTTPClient
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Telegram.Types as Types
+import qualified Omni.Agent.Tools.Pdf as Pdf
+import qualified Omni.Test as Test
+import System.IO (hClose)
+import System.IO.Temp (withSystemTempFile)
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+ Test.group
+ "Omni.Agent.Telegram.Media"
+ [ Test.unit "maxImageBytes is 10MB" <| do
+ maxImageBytes Test.@=? 10_000_000,
+ Test.unit "maxVoiceBytes is 20MB" <| do
+ maxVoiceBytes Test.@=? 20_000_000,
+ Test.unit "checkPhotoSize accepts small photos" <| do
+ let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000)
+ checkPhotoSize photo Test.@=? Right (),
+ Test.unit "checkPhotoSize rejects large photos" <| do
+ let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000)
+ case checkPhotoSize photo of
+ Left _ -> pure ()
+ Right _ -> Test.assertFailure "Expected rejection",
+ Test.unit "checkVoiceSize accepts small voice" <| do
+ let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000)
+ checkVoiceSize voice Test.@=? Right (),
+ Test.unit "checkVoiceSize rejects large voice" <| do
+ let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000)
+ case checkVoiceSize voice of
+ Left _ -> pure ()
+ Right _ -> Test.assertFailure "Expected rejection"
+ ]
+
+maxImageBytes :: Int
+maxImageBytes = 10_000_000
+
+maxVoiceBytes :: Int
+maxVoiceBytes = 20_000_000
+
+checkPhotoSize :: Types.TelegramPhoto -> Either Text ()
+checkPhotoSize photo =
+ case Types.tpFileSize photo of
+ Just size
+ | size > maxImageBytes ->
+ Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB"
+ _ -> Right ()
+
+checkVoiceSize :: Types.TelegramVoice -> Either Text ()
+checkVoiceSize voice =
+ case Types.tvFileSize voice of
+ Just size
+ | size > maxVoiceBytes ->
+ Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB"
+ _ -> Right ()
+
+httpGetBytes :: String -> IO (Either Text BL.ByteString)
+httpGetBytes url = do
+ result <-
+ try <| do
+ req <- HTTP.parseRequest url
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
+ Right r -> pure r
+
+httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString)
+httpPostJson url extraHeaders body timeoutSecs = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest url
+ let baseReq =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Content-Type" ["application/json"]
+ <| HTTP.setRequestBodyLBS (Aeson.encode body)
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000))
+ <| req0
+ req = foldr addHeader baseReq extraHeaders
+ addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value
+ resp <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode resp
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody resp))
+ else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
+ Right r -> pure r
+ where
+ shortBody r =
+ let b = BL.toStrict (HTTP.getResponseBody r)
+ in TE.decodeUtf8 (BS.take 200 b)
+
+getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text)
+getFile cfg fileId = do
+ let url =
+ Text.unpack (Types.tgApiBaseUrl cfg)
+ <> "/bot"
+ <> Text.unpack (Types.tgBotToken cfg)
+ <> "/getFile?file_id="
+ <> Text.unpack fileId
+ result <- httpGetBytes url
+ case result of
+ Left err -> pure (Left err)
+ Right body ->
+ case Aeson.decode body of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of
+ Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of
+ Just (Aeson.String path) -> pure (Right path)
+ _ -> pure (Left "No file_path in response")
+ _ -> pure (Left "No result in response")
+ _ -> pure (Left "Failed to parse getFile response")
+
+downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString)
+downloadFileBytes cfg filePath = do
+ let url =
+ "https://api.telegram.org/file/bot"
+ <> Text.unpack (Types.tgBotToken cfg)
+ <> "/"
+ <> Text.unpack filePath
+ httpGetBytes url
+
+downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ())
+downloadFile cfg filePath destPath = do
+ result <- downloadFileBytes cfg filePath
+ case result of
+ Left err -> pure (Left err)
+ Right bytes -> do
+ BL.writeFile destPath bytes
+ pure (Right ())
+
+downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString)
+downloadPhoto cfg photo = do
+ filePathResult <- getFile cfg (Types.tpFileId photo)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString)
+downloadVoice cfg voice = do
+ filePathResult <- getFile cfg (Types.tvFileId voice)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text)
+downloadAndExtractPdf cfg fileId = do
+ filePathResult <- getFile cfg fileId
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath ->
+ withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
+ hClose tmpHandle
+ downloadResult <- downloadFile cfg filePath tmpPath
+ case downloadResult of
+ Left err -> pure (Left err)
+ Right () -> Pdf.extractPdfText tmpPath
+
+parseOpenRouterResponse :: BL.ByteString -> Either Text Text
+parseOpenRouterResponse body =
+ case Aeson.decode body of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
+ Just (Aeson.Array choices) | not (null choices) ->
+ case toList choices of
+ (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
+ Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
+ Just (Aeson.String content) -> Right content
+ Just Aeson.Null -> Left "No content in response"
+ _ -> Left "Unexpected content type in response"
+ _ -> Left "No message in choice"
+ _ -> Left "Empty choices array"
+ _ -> Left "No choices in response"
+ _ -> Left "Failed to parse response"
+
+analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text)
+analyzeImage apiKey imageBytes userPrompt = do
+ let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes))
+ dataUrl = "data:image/jpeg;base64," <> base64Data
+ prompt =
+ if Text.null userPrompt
+ then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown."
+ else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)"
+ body =
+ Aeson.object
+ [ "model" .= ("anthropic/claude-sonnet-4" :: Text),
+ "messages"
+ .= [ Aeson.object
+ [ "role" .= ("user" :: Text),
+ "content"
+ .= [ Aeson.object
+ [ "type" .= ("text" :: Text),
+ "text" .= prompt
+ ],
+ Aeson.object
+ [ "type" .= ("image_url" :: Text),
+ "image_url"
+ .= Aeson.object
+ [ "url" .= dataUrl
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ headers =
+ [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
+ ("HTTP-Referer", "https://omni.dev"),
+ ("X-Title", "Omni Agent")
+ ]
+ result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+ case result of
+ Left err -> pure (Left ("Vision API error: " <> err))
+ Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
+
+transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeVoice apiKey audioBytes = do
+ let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
+ body =
+ Aeson.object
+ [ "model" .= ("google/gemini-2.0-flash-001" :: Text),
+ "messages"
+ .= [ Aeson.object
+ [ "role" .= ("user" :: Text),
+ "content"
+ .= [ Aeson.object
+ [ "type" .= ("text" :: Text),
+ "text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text)
+ ],
+ Aeson.object
+ [ "type" .= ("input_audio" :: Text),
+ "input_audio"
+ .= Aeson.object
+ [ "data" .= base64Data,
+ "format" .= ("ogg" :: Text)
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ headers =
+ [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
+ ("HTTP-Referer", "https://omni.dev"),
+ ("X-Title", "Omni Agent")
+ ]
+ result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
+ case result of
+ Left err -> pure (Left ("Transcription API error: " <> err))
+ Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))