diff options
| author | Ben Sima <ben@bensima.com> | 2025-12-12 22:25:56 -0500 |
|---|---|---|
| committer | Ben Sima <ben@bensima.com> | 2025-12-12 22:25:56 -0500 |
| commit | bfa50a5a755e13c0ee2394d89280092a639d8f0d (patch) | |
| tree | f60a290cc14f6b3abd7a39a1de316f984a652757 /Omni/Agent | |
| parent | 1b4dc94eb261e3f3cd22dc12fbc1941e2a545cb9 (diff) | |
feat: add image and voice message support for Telegram bot
- Add TelegramPhoto and TelegramVoice types
- Parse photo and voice fields from Telegram updates
- Download photos/voice via Telegram API
- Analyze images using Claude vision via OpenRouter
- Transcribe voice messages using Gemini audio via OpenRouter
- Wire multimedia processing into handleAuthorizedMessage
Photos are analyzed with user's caption as context.
Voice messages are transcribed and treated as text input.
Diffstat (limited to 'Omni/Agent')
| -rw-r--r-- | Omni/Agent/Telegram.hs | 352 |
1 files changed, 334 insertions, 18 deletions
diff --git a/Omni/Agent/Telegram.hs b/Omni/Agent/Telegram.hs index 27b3ccf..9184ef3 100644 --- a/Omni/Agent/Telegram.hs +++ b/Omni/Agent/Telegram.hs @@ -25,6 +25,8 @@ module Omni.Agent.Telegram TelegramMessage (..), TelegramUpdate (..), TelegramDocument (..), + TelegramPhoto (..), + TelegramVoice (..), -- * Telegram API getUpdates, @@ -63,8 +65,11 @@ import Control.Concurrent.STM (newTVarIO, readTVarIO, writeTVar) import Data.Aeson ((.!=), (.:), (.:?), (.=)) import qualified Data.Aeson as Aeson import qualified Data.Aeson.KeyMap as KeyMap +import qualified Data.ByteString.Base64.Lazy as B64 import qualified Data.ByteString.Lazy as BL import qualified Data.Text as Text +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TLE import Data.Time (getCurrentTime, utcToLocalTime) import Data.Time.Format (defaultTimeLocale, formatTime) import Data.Time.LocalTime (getCurrentTimeZone) @@ -98,7 +103,8 @@ test = tgPollingTimeout = 30, tgApiBaseUrl = "https://api.telegram.org", tgAllowedUserIds = [123, 456], - tgKagiApiKey = Just "kagi-key" + tgKagiApiKey = Just "kagi-key", + tgOpenRouterApiKey = "or-key" } case Aeson.decode (Aeson.encode cfg) of Nothing -> Test.assertFailure "Failed to decode TelegramConfig" @@ -107,12 +113,12 @@ test = tgAllowedUserIds decoded Test.@=? [123, 456] tgKagiApiKey decoded Test.@=? Just "kagi-key", Test.unit "isUserAllowed checks whitelist" <| do - let cfg = defaultTelegramConfig "token" [100, 200, 300] Nothing + let cfg = defaultTelegramConfig "token" [100, 200, 300] Nothing "key" isUserAllowed cfg 100 Test.@=? True isUserAllowed cfg 200 Test.@=? True isUserAllowed cfg 999 Test.@=? False, Test.unit "isUserAllowed allows all when empty" <| do - let cfg = defaultTelegramConfig "token" [] Nothing + let cfg = defaultTelegramConfig "token" [] Nothing "key" isUserAllowed cfg 12345 Test.@=? True, Test.unit "TelegramMessage JSON roundtrip" <| do let msg = @@ -123,7 +129,9 @@ test = tmUserFirstName = "Test", tmUserLastName = Just "User", tmText = "Hello bot", - tmDocument = Nothing + tmDocument = Nothing, + tmPhoto = Nothing, + tmVoice = Nothing } case Aeson.decode (Aeson.encode msg) of Nothing -> Test.assertFailure "Failed to decode TelegramMessage" @@ -207,7 +215,8 @@ data TelegramConfig = TelegramConfig tgPollingTimeout :: Int, tgApiBaseUrl :: Text, tgAllowedUserIds :: [Int], - tgKagiApiKey :: Maybe Text + tgKagiApiKey :: Maybe Text, + tgOpenRouterApiKey :: Text } deriving (Show, Eq, Generic) @@ -218,7 +227,8 @@ instance Aeson.ToJSON TelegramConfig where "polling_timeout" .= tgPollingTimeout c, "api_base_url" .= tgApiBaseUrl c, "allowed_user_ids" .= tgAllowedUserIds c, - "kagi_api_key" .= tgKagiApiKey c + "kagi_api_key" .= tgKagiApiKey c, + "openrouter_api_key" .= tgOpenRouterApiKey c ] instance Aeson.FromJSON TelegramConfig where @@ -229,16 +239,18 @@ instance Aeson.FromJSON TelegramConfig where <*> (v .:? "api_base_url" .!= "https://api.telegram.org") <*> (v .:? "allowed_user_ids" .!= []) <*> (v .:? "kagi_api_key") + <*> (v .: "openrouter_api_key") -- | Default Telegram configuration (requires token from env). -defaultTelegramConfig :: Text -> [Int] -> Maybe Text -> TelegramConfig -defaultTelegramConfig token allowedIds kagiKey = +defaultTelegramConfig :: Text -> [Int] -> Maybe Text -> Text -> TelegramConfig +defaultTelegramConfig token allowedIds kagiKey openRouterKey = TelegramConfig { tgBotToken = token, tgPollingTimeout = 30, tgApiBaseUrl = "https://api.telegram.org", tgAllowedUserIds = allowedIds, - tgKagiApiKey = kagiKey + tgKagiApiKey = kagiKey, + tgOpenRouterApiKey = openRouterKey } -- | Check if a user is allowed to use the bot. @@ -272,6 +284,56 @@ instance Aeson.FromJSON TelegramDocument where <*> (v .:? "mime_type") <*> (v .:? "file_size") +data TelegramPhoto = TelegramPhoto + { tpFileId :: Text, + tpWidth :: Int, + tpHeight :: Int, + tpFileSize :: Maybe Int + } + deriving (Show, Eq, Generic) + +instance Aeson.ToJSON TelegramPhoto where + toJSON p = + Aeson.object + [ "file_id" .= tpFileId p, + "width" .= tpWidth p, + "height" .= tpHeight p, + "file_size" .= tpFileSize p + ] + +instance Aeson.FromJSON TelegramPhoto where + parseJSON = + Aeson.withObject "TelegramPhoto" <| \v -> + (TelegramPhoto </ (v .: "file_id")) + <*> (v .: "width") + <*> (v .: "height") + <*> (v .:? "file_size") + +data TelegramVoice = TelegramVoice + { tvFileId :: Text, + tvDuration :: Int, + tvMimeType :: Maybe Text, + tvFileSize :: Maybe Int + } + deriving (Show, Eq, Generic) + +instance Aeson.ToJSON TelegramVoice where + toJSON v = + Aeson.object + [ "file_id" .= tvFileId v, + "duration" .= tvDuration v, + "mime_type" .= tvMimeType v, + "file_size" .= tvFileSize v + ] + +instance Aeson.FromJSON TelegramVoice where + parseJSON = + Aeson.withObject "TelegramVoice" <| \v -> + (TelegramVoice </ (v .: "file_id")) + <*> (v .: "duration") + <*> (v .:? "mime_type") + <*> (v .:? "file_size") + -- | A parsed Telegram message from a user. data TelegramMessage = TelegramMessage { tmUpdateId :: Int, @@ -280,7 +342,9 @@ data TelegramMessage = TelegramMessage tmUserFirstName :: Text, tmUserLastName :: Maybe Text, tmText :: Text, - tmDocument :: Maybe TelegramDocument + tmDocument :: Maybe TelegramDocument, + tmPhoto :: Maybe TelegramPhoto, + tmVoice :: Maybe TelegramVoice } deriving (Show, Eq, Generic) @@ -293,7 +357,9 @@ instance Aeson.ToJSON TelegramMessage where "user_first_name" .= tmUserFirstName m, "user_last_name" .= tmUserLastName m, "text" .= tmText m, - "document" .= tmDocument m + "document" .= tmDocument m, + "photo" .= tmPhoto m, + "voice" .= tmVoice m ] instance Aeson.FromJSON TelegramMessage where @@ -306,6 +372,8 @@ instance Aeson.FromJSON TelegramMessage where <*> (v .:? "user_last_name") <*> (v .: "text") <*> (v .:? "document") + <*> (v .:? "photo") + <*> (v .:? "voice") -- | Raw Telegram update for parsing. data TelegramUpdate = TelegramUpdate @@ -352,7 +420,13 @@ parseUpdate val = do let document = case KeyMap.lookup "document" msgObj of Just (Aeson.Object docObj) -> parseDocument docObj _ -> Nothing - let hasContent = not (Text.null text) || not (Text.null caption) || isJust document + let photo = case KeyMap.lookup "photo" msgObj of + Just (Aeson.Array photos) -> parseLargestPhoto (toList photos) + _ -> Nothing + let voice = case KeyMap.lookup "voice" msgObj of + Just (Aeson.Object voiceObj) -> parseVoice voiceObj + _ -> Nothing + let hasContent = not (Text.null text) || not (Text.null caption) || isJust document || isJust photo || isJust voice guard hasContent pure TelegramMessage @@ -362,7 +436,9 @@ parseUpdate val = do tmUserFirstName = firstName, tmUserLastName = lastName, tmText = if Text.null text then caption else text, - tmDocument = document + tmDocument = document, + tmPhoto = photo, + tmVoice = voice } -- | Parse document object from Telegram message. @@ -388,6 +464,58 @@ parseDocument docObj = do tdFileSize = fileSize } +parseLargestPhoto :: [Aeson.Value] -> Maybe TelegramPhoto +parseLargestPhoto photos = do + let parsed = mapMaybe parsePhotoSize photos + case parsed of + [] -> Nothing + ps -> Just (maximumBy (comparing tpWidth) ps) + +parsePhotoSize :: Aeson.Value -> Maybe TelegramPhoto +parsePhotoSize val = do + Aeson.Object obj <- pure val + fileId <- case KeyMap.lookup "file_id" obj of + Just (Aeson.String s) -> Just s + _ -> Nothing + width <- case KeyMap.lookup "width" obj of + Just (Aeson.Number n) -> Just (round n) + _ -> Nothing + height <- case KeyMap.lookup "height" obj of + Just (Aeson.Number n) -> Just (round n) + _ -> Nothing + let fileSize = case KeyMap.lookup "file_size" obj of + Just (Aeson.Number n) -> Just (round n) + _ -> Nothing + pure + TelegramPhoto + { tpFileId = fileId, + tpWidth = width, + tpHeight = height, + tpFileSize = fileSize + } + +parseVoice :: Aeson.Object -> Maybe TelegramVoice +parseVoice obj = do + fileId <- case KeyMap.lookup "file_id" obj of + Just (Aeson.String s) -> Just s + _ -> Nothing + duration <- case KeyMap.lookup "duration" obj of + Just (Aeson.Number n) -> Just (round n) + _ -> Nothing + let mimeType = case KeyMap.lookup "mime_type" obj of + Just (Aeson.String s) -> Just s + _ -> Nothing + fileSize = case KeyMap.lookup "file_size" obj of + Just (Aeson.Number n) -> Just (round n) + _ -> Nothing + pure + TelegramVoice + { tvFileId = fileId, + tvDuration = duration, + tvMimeType = mimeType, + tvFileSize = fileSize + } + -- | Poll Telegram for new updates. getUpdates :: TelegramConfig -> Int -> IO [TelegramMessage] getUpdates cfg offset = do @@ -533,6 +661,148 @@ downloadFile cfg filePath destPath = do Left (e :: SomeException) -> pure (Left ("Download error: " <> tshow e)) Right r -> pure r +downloadFileBytes :: TelegramConfig -> Text -> IO (Either Text BL.ByteString) +downloadFileBytes cfg filePath = do + let url = + "https://api.telegram.org/file/bot" + <> Text.unpack (tgBotToken cfg) + <> "/" + <> Text.unpack filePath + result <- + try <| do + req <- HTTP.parseRequest url + response <- HTTP.httpLBS req + let status = HTTP.getResponseStatusCode response + if status >= 200 && status < 300 + then pure (Right (HTTP.getResponseBody response)) + else pure (Left ("Download failed: HTTP " <> tshow status)) + case result of + Left (e :: SomeException) -> pure (Left ("Download error: " <> tshow e)) + Right r -> pure r + +downloadPhoto :: TelegramConfig -> TelegramPhoto -> IO (Either Text BL.ByteString) +downloadPhoto cfg photo = do + filePathResult <- getFile cfg (tpFileId photo) + case filePathResult of + Left err -> pure (Left err) + Right filePath -> downloadFileBytes cfg filePath + +downloadVoice :: TelegramConfig -> TelegramVoice -> IO (Either Text BL.ByteString) +downloadVoice cfg voice = do + filePathResult <- getFile cfg (tvFileId voice) + case filePathResult of + Left err -> pure (Left err) + Right filePath -> downloadFileBytes cfg filePath + +analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text) +analyzeImage apiKey imageBytes userPrompt = do + let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes)) + dataUrl = "data:image/jpeg;base64," <> base64Data + prompt = if Text.null userPrompt then "describe this image" else userPrompt + body = + Aeson.object + [ "model" .= ("anthropic/claude-sonnet-4" :: Text), + "messages" + .= [ Aeson.object + [ "role" .= ("user" :: Text), + "content" + .= [ Aeson.object + [ "type" .= ("text" :: Text), + "text" .= prompt + ], + Aeson.object + [ "type" .= ("image_url" :: Text), + "image_url" + .= Aeson.object + [ "url" .= dataUrl + ] + ] + ] + ] + ] + ] + req0 <- HTTP.parseRequest "https://openrouter.ai/api/v1/chat/completions" + let req = + HTTP.setRequestMethod "POST" + <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey] + <| HTTP.setRequestHeader "Content-Type" ["application/json"] + <| HTTP.setRequestBodyLBS (Aeson.encode body) + <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000)) + <| req0 + result <- try (HTTP.httpLBS req) + case result of + Left (e :: SomeException) -> pure (Left ("Vision API error: " <> tshow e)) + Right response -> do + let status = HTTP.getResponseStatusCode response + if status >= 200 && status < 300 + then case Aeson.decode (HTTP.getResponseBody response) of + Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of + Just (Aeson.Array choices) | not (null choices) -> + case toList choices of + (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of + Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of + Just (Aeson.String content) -> pure (Right content) + _ -> pure (Left "No content in message") + _ -> pure (Left "No message in choice") + _ -> pure (Left "Empty choices array") + _ -> pure (Left "No choices in response") + _ -> pure (Left "Failed to parse vision response") + else pure (Left ("Vision API HTTP error: " <> tshow status)) + +transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text) +transcribeVoice apiKey audioBytes = do + let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes)) + body = + Aeson.object + [ "model" .= ("google/gemini-2.0-flash-001" :: Text), + "messages" + .= [ Aeson.object + [ "role" .= ("user" :: Text), + "content" + .= [ Aeson.object + [ "type" .= ("text" :: Text), + "text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text) + ], + Aeson.object + [ "type" .= ("input_audio" :: Text), + "input_audio" + .= Aeson.object + [ "data" .= base64Data, + "format" .= ("ogg" :: Text) + ] + ] + ] + ] + ] + ] + req0 <- HTTP.parseRequest "https://openrouter.ai/api/v1/chat/completions" + let req = + HTTP.setRequestMethod "POST" + <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey] + <| HTTP.setRequestHeader "Content-Type" ["application/json"] + <| HTTP.setRequestBodyLBS (Aeson.encode body) + <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000)) + <| req0 + result <- try (HTTP.httpLBS req) + case result of + Left (e :: SomeException) -> pure (Left ("Transcription API error: " <> tshow e)) + Right response -> do + let status = HTTP.getResponseStatusCode response + if status >= 200 && status < 300 + then case Aeson.decode (HTTP.getResponseBody response) of + Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of + Just (Aeson.Array choices) | not (null choices) -> + case toList choices of + (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of + Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of + Just (Aeson.String content) -> pure (Right content) + _ -> pure (Left "No content in message") + _ -> pure (Left "No message in choice") + _ -> pure (Left "Empty choices array") + _ -> pure (Left "No choices in response") + _ -> pure (Left "Failed to parse transcription response") + else pure (Left ("Transcription API HTTP error: " <> tshow status)) + -- | Check if a document is a PDF. isPdf :: TelegramDocument -> Bool isPdf doc = @@ -728,12 +998,57 @@ handleAuthorizedMessage tgConfig provider engineCfg msg uid userName chatId = do pure (Just truncated) _ -> pure Nothing - let userMessage = case pdfContent of - Just pdfText -> + photoAnalysis <- case tmPhoto msg of + Just photo -> do + putText <| "Processing photo: " <> tshow (tpWidth photo) <> "x" <> tshow (tpHeight photo) + bytesResult <- downloadPhoto tgConfig photo + case bytesResult of + Left err -> do + putText <| "Photo download failed: " <> err + pure Nothing + Right bytes -> do + putText <| "Downloaded photo, " <> tshow (BL.length bytes) <> " bytes, analyzing..." + analysisResult <- analyzeImage (tgOpenRouterApiKey tgConfig) bytes (tmText msg) + case analysisResult of + Left err -> do + putText <| "Photo analysis failed: " <> err + pure Nothing + Right analysis -> do + putText <| "Photo analyzed: " <> Text.take 100 analysis <> "..." + pure (Just analysis) + Nothing -> pure Nothing + + voiceTranscription <- case tmVoice msg of + Just voice -> do + putText <| "Processing voice message: " <> tshow (tvDuration voice) <> " seconds" + bytesResult <- downloadVoice tgConfig voice + case bytesResult of + Left err -> do + putText <| "Voice download failed: " <> err + pure Nothing + Right bytes -> do + putText <| "Downloaded voice, " <> tshow (BL.length bytes) <> " bytes, transcribing..." + transcribeResult <- transcribeVoice (tgOpenRouterApiKey tgConfig) bytes + case transcribeResult of + Left err -> do + putText <| "Voice transcription failed: " <> err + pure Nothing + Right transcription -> do + putText <| "Transcribed: " <> Text.take 100 transcription <> "..." + pure (Just transcription) + Nothing -> pure Nothing + + let userMessage = case (pdfContent, photoAnalysis, voiceTranscription) of + (Just pdfText, _, _) -> let caption = tmText msg prefix = if Text.null caption then "here's the PDF content:\n\n" else caption <> "\n\n---\nPDF content:\n\n" in prefix <> pdfText - Nothing -> tmText msg + (_, Just analysis, _) -> + let caption = tmText msg + prefix = if Text.null caption then "[user sent an image]\n\n" else caption <> "\n\n[image analysis follows]\n\n" + in prefix <> analysis + (_, _, Just transcription) -> transcription + _ -> tmText msg _ <- Memory.saveMessage uid chatId Memory.UserRole (Just userName) userMessage @@ -957,8 +1272,9 @@ startBot maybeToken = do putText "Error: OPENROUTER_API_KEY not set" exitFailure Just key -> do - let tgConfig = defaultTelegramConfig token allowedIds kagiKey - provider = Provider.defaultOpenRouter (Text.pack key) "anthropic/claude-sonnet-4" + let orKey = Text.pack key + tgConfig = defaultTelegramConfig token allowedIds kagiKey orKey + provider = Provider.defaultOpenRouter orKey "anthropic/claude-sonnet-4" putText <| "Allowed user IDs: " <> tshow allowedIds putText <| "Kagi search: " <> if isJust kagiKey then "enabled" else "disabled" runTelegramBot tgConfig provider |
