summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Sima <ben@bensima.com>2025-12-12 22:25:56 -0500
committerBen Sima <ben@bensima.com>2025-12-12 22:25:56 -0500
commitbfa50a5a755e13c0ee2394d89280092a639d8f0d (patch)
treef60a290cc14f6b3abd7a39a1de316f984a652757
parent1b4dc94eb261e3f3cd22dc12fbc1941e2a545cb9 (diff)
feat: add image and voice message support for Telegram bot
- Add TelegramPhoto and TelegramVoice types - Parse photo and voice fields from Telegram updates - Download photos/voice via Telegram API - Analyze images using Claude vision via OpenRouter - Transcribe voice messages using Gemini audio via OpenRouter - Wire multimedia processing into handleAuthorizedMessage Photos are analyzed with user's caption as context. Voice messages are transcribed and treated as text input.
-rw-r--r--Omni/Agent/Telegram.hs352
1 files changed, 334 insertions, 18 deletions
diff --git a/Omni/Agent/Telegram.hs b/Omni/Agent/Telegram.hs
index 27b3ccf..9184ef3 100644
--- a/Omni/Agent/Telegram.hs
+++ b/Omni/Agent/Telegram.hs
@@ -25,6 +25,8 @@ module Omni.Agent.Telegram
TelegramMessage (..),
TelegramUpdate (..),
TelegramDocument (..),
+ TelegramPhoto (..),
+ TelegramVoice (..),
-- * Telegram API
getUpdates,
@@ -63,8 +65,11 @@ import Control.Concurrent.STM (newTVarIO, readTVarIO, writeTVar)
import Data.Aeson ((.!=), (.:), (.:?), (.=))
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.KeyMap as KeyMap
+import qualified Data.ByteString.Base64.Lazy as B64
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as Text
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TLE
import Data.Time (getCurrentTime, utcToLocalTime)
import Data.Time.Format (defaultTimeLocale, formatTime)
import Data.Time.LocalTime (getCurrentTimeZone)
@@ -98,7 +103,8 @@ test =
tgPollingTimeout = 30,
tgApiBaseUrl = "https://api.telegram.org",
tgAllowedUserIds = [123, 456],
- tgKagiApiKey = Just "kagi-key"
+ tgKagiApiKey = Just "kagi-key",
+ tgOpenRouterApiKey = "or-key"
}
case Aeson.decode (Aeson.encode cfg) of
Nothing -> Test.assertFailure "Failed to decode TelegramConfig"
@@ -107,12 +113,12 @@ test =
tgAllowedUserIds decoded Test.@=? [123, 456]
tgKagiApiKey decoded Test.@=? Just "kagi-key",
Test.unit "isUserAllowed checks whitelist" <| do
- let cfg = defaultTelegramConfig "token" [100, 200, 300] Nothing
+ let cfg = defaultTelegramConfig "token" [100, 200, 300] Nothing "key"
isUserAllowed cfg 100 Test.@=? True
isUserAllowed cfg 200 Test.@=? True
isUserAllowed cfg 999 Test.@=? False,
Test.unit "isUserAllowed allows all when empty" <| do
- let cfg = defaultTelegramConfig "token" [] Nothing
+ let cfg = defaultTelegramConfig "token" [] Nothing "key"
isUserAllowed cfg 12345 Test.@=? True,
Test.unit "TelegramMessage JSON roundtrip" <| do
let msg =
@@ -123,7 +129,9 @@ test =
tmUserFirstName = "Test",
tmUserLastName = Just "User",
tmText = "Hello bot",
- tmDocument = Nothing
+ tmDocument = Nothing,
+ tmPhoto = Nothing,
+ tmVoice = Nothing
}
case Aeson.decode (Aeson.encode msg) of
Nothing -> Test.assertFailure "Failed to decode TelegramMessage"
@@ -207,7 +215,8 @@ data TelegramConfig = TelegramConfig
tgPollingTimeout :: Int,
tgApiBaseUrl :: Text,
tgAllowedUserIds :: [Int],
- tgKagiApiKey :: Maybe Text
+ tgKagiApiKey :: Maybe Text,
+ tgOpenRouterApiKey :: Text
}
deriving (Show, Eq, Generic)
@@ -218,7 +227,8 @@ instance Aeson.ToJSON TelegramConfig where
"polling_timeout" .= tgPollingTimeout c,
"api_base_url" .= tgApiBaseUrl c,
"allowed_user_ids" .= tgAllowedUserIds c,
- "kagi_api_key" .= tgKagiApiKey c
+ "kagi_api_key" .= tgKagiApiKey c,
+ "openrouter_api_key" .= tgOpenRouterApiKey c
]
instance Aeson.FromJSON TelegramConfig where
@@ -229,16 +239,18 @@ instance Aeson.FromJSON TelegramConfig where
<*> (v .:? "api_base_url" .!= "https://api.telegram.org")
<*> (v .:? "allowed_user_ids" .!= [])
<*> (v .:? "kagi_api_key")
+ <*> (v .: "openrouter_api_key")
-- | Default Telegram configuration (requires token from env).
-defaultTelegramConfig :: Text -> [Int] -> Maybe Text -> TelegramConfig
-defaultTelegramConfig token allowedIds kagiKey =
+defaultTelegramConfig :: Text -> [Int] -> Maybe Text -> Text -> TelegramConfig
+defaultTelegramConfig token allowedIds kagiKey openRouterKey =
TelegramConfig
{ tgBotToken = token,
tgPollingTimeout = 30,
tgApiBaseUrl = "https://api.telegram.org",
tgAllowedUserIds = allowedIds,
- tgKagiApiKey = kagiKey
+ tgKagiApiKey = kagiKey,
+ tgOpenRouterApiKey = openRouterKey
}
-- | Check if a user is allowed to use the bot.
@@ -272,6 +284,56 @@ instance Aeson.FromJSON TelegramDocument where
<*> (v .:? "mime_type")
<*> (v .:? "file_size")
+data TelegramPhoto = TelegramPhoto
+ { tpFileId :: Text,
+ tpWidth :: Int,
+ tpHeight :: Int,
+ tpFileSize :: Maybe Int
+ }
+ deriving (Show, Eq, Generic)
+
+instance Aeson.ToJSON TelegramPhoto where
+ toJSON p =
+ Aeson.object
+ [ "file_id" .= tpFileId p,
+ "width" .= tpWidth p,
+ "height" .= tpHeight p,
+ "file_size" .= tpFileSize p
+ ]
+
+instance Aeson.FromJSON TelegramPhoto where
+ parseJSON =
+ Aeson.withObject "TelegramPhoto" <| \v ->
+ (TelegramPhoto </ (v .: "file_id"))
+ <*> (v .: "width")
+ <*> (v .: "height")
+ <*> (v .:? "file_size")
+
+data TelegramVoice = TelegramVoice
+ { tvFileId :: Text,
+ tvDuration :: Int,
+ tvMimeType :: Maybe Text,
+ tvFileSize :: Maybe Int
+ }
+ deriving (Show, Eq, Generic)
+
+instance Aeson.ToJSON TelegramVoice where
+ toJSON v =
+ Aeson.object
+ [ "file_id" .= tvFileId v,
+ "duration" .= tvDuration v,
+ "mime_type" .= tvMimeType v,
+ "file_size" .= tvFileSize v
+ ]
+
+instance Aeson.FromJSON TelegramVoice where
+ parseJSON =
+ Aeson.withObject "TelegramVoice" <| \v ->
+ (TelegramVoice </ (v .: "file_id"))
+ <*> (v .: "duration")
+ <*> (v .:? "mime_type")
+ <*> (v .:? "file_size")
+
-- | A parsed Telegram message from a user.
data TelegramMessage = TelegramMessage
{ tmUpdateId :: Int,
@@ -280,7 +342,9 @@ data TelegramMessage = TelegramMessage
tmUserFirstName :: Text,
tmUserLastName :: Maybe Text,
tmText :: Text,
- tmDocument :: Maybe TelegramDocument
+ tmDocument :: Maybe TelegramDocument,
+ tmPhoto :: Maybe TelegramPhoto,
+ tmVoice :: Maybe TelegramVoice
}
deriving (Show, Eq, Generic)
@@ -293,7 +357,9 @@ instance Aeson.ToJSON TelegramMessage where
"user_first_name" .= tmUserFirstName m,
"user_last_name" .= tmUserLastName m,
"text" .= tmText m,
- "document" .= tmDocument m
+ "document" .= tmDocument m,
+ "photo" .= tmPhoto m,
+ "voice" .= tmVoice m
]
instance Aeson.FromJSON TelegramMessage where
@@ -306,6 +372,8 @@ instance Aeson.FromJSON TelegramMessage where
<*> (v .:? "user_last_name")
<*> (v .: "text")
<*> (v .:? "document")
+ <*> (v .:? "photo")
+ <*> (v .:? "voice")
-- | Raw Telegram update for parsing.
data TelegramUpdate = TelegramUpdate
@@ -352,7 +420,13 @@ parseUpdate val = do
let document = case KeyMap.lookup "document" msgObj of
Just (Aeson.Object docObj) -> parseDocument docObj
_ -> Nothing
- let hasContent = not (Text.null text) || not (Text.null caption) || isJust document
+ let photo = case KeyMap.lookup "photo" msgObj of
+ Just (Aeson.Array photos) -> parseLargestPhoto (toList photos)
+ _ -> Nothing
+ let voice = case KeyMap.lookup "voice" msgObj of
+ Just (Aeson.Object voiceObj) -> parseVoice voiceObj
+ _ -> Nothing
+ let hasContent = not (Text.null text) || not (Text.null caption) || isJust document || isJust photo || isJust voice
guard hasContent
pure
TelegramMessage
@@ -362,7 +436,9 @@ parseUpdate val = do
tmUserFirstName = firstName,
tmUserLastName = lastName,
tmText = if Text.null text then caption else text,
- tmDocument = document
+ tmDocument = document,
+ tmPhoto = photo,
+ tmVoice = voice
}
-- | Parse document object from Telegram message.
@@ -388,6 +464,58 @@ parseDocument docObj = do
tdFileSize = fileSize
}
+parseLargestPhoto :: [Aeson.Value] -> Maybe TelegramPhoto
+parseLargestPhoto photos = do
+ let parsed = mapMaybe parsePhotoSize photos
+ case parsed of
+ [] -> Nothing
+ ps -> Just (maximumBy (comparing tpWidth) ps)
+
+parsePhotoSize :: Aeson.Value -> Maybe TelegramPhoto
+parsePhotoSize val = do
+ Aeson.Object obj <- pure val
+ fileId <- case KeyMap.lookup "file_id" obj of
+ Just (Aeson.String s) -> Just s
+ _ -> Nothing
+ width <- case KeyMap.lookup "width" obj of
+ Just (Aeson.Number n) -> Just (round n)
+ _ -> Nothing
+ height <- case KeyMap.lookup "height" obj of
+ Just (Aeson.Number n) -> Just (round n)
+ _ -> Nothing
+ let fileSize = case KeyMap.lookup "file_size" obj of
+ Just (Aeson.Number n) -> Just (round n)
+ _ -> Nothing
+ pure
+ TelegramPhoto
+ { tpFileId = fileId,
+ tpWidth = width,
+ tpHeight = height,
+ tpFileSize = fileSize
+ }
+
+parseVoice :: Aeson.Object -> Maybe TelegramVoice
+parseVoice obj = do
+ fileId <- case KeyMap.lookup "file_id" obj of
+ Just (Aeson.String s) -> Just s
+ _ -> Nothing
+ duration <- case KeyMap.lookup "duration" obj of
+ Just (Aeson.Number n) -> Just (round n)
+ _ -> Nothing
+ let mimeType = case KeyMap.lookup "mime_type" obj of
+ Just (Aeson.String s) -> Just s
+ _ -> Nothing
+ fileSize = case KeyMap.lookup "file_size" obj of
+ Just (Aeson.Number n) -> Just (round n)
+ _ -> Nothing
+ pure
+ TelegramVoice
+ { tvFileId = fileId,
+ tvDuration = duration,
+ tvMimeType = mimeType,
+ tvFileSize = fileSize
+ }
+
-- | Poll Telegram for new updates.
getUpdates :: TelegramConfig -> Int -> IO [TelegramMessage]
getUpdates cfg offset = do
@@ -533,6 +661,148 @@ downloadFile cfg filePath destPath = do
Left (e :: SomeException) -> pure (Left ("Download error: " <> tshow e))
Right r -> pure r
+downloadFileBytes :: TelegramConfig -> Text -> IO (Either Text BL.ByteString)
+downloadFileBytes cfg filePath = do
+ let url =
+ "https://api.telegram.org/file/bot"
+ <> Text.unpack (tgBotToken cfg)
+ <> "/"
+ <> Text.unpack filePath
+ result <-
+ try <| do
+ req <- HTTP.parseRequest url
+ response <- HTTP.httpLBS req
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then pure (Right (HTTP.getResponseBody response))
+ else pure (Left ("Download failed: HTTP " <> tshow status))
+ case result of
+ Left (e :: SomeException) -> pure (Left ("Download error: " <> tshow e))
+ Right r -> pure r
+
+downloadPhoto :: TelegramConfig -> TelegramPhoto -> IO (Either Text BL.ByteString)
+downloadPhoto cfg photo = do
+ filePathResult <- getFile cfg (tpFileId photo)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+downloadVoice :: TelegramConfig -> TelegramVoice -> IO (Either Text BL.ByteString)
+downloadVoice cfg voice = do
+ filePathResult <- getFile cfg (tvFileId voice)
+ case filePathResult of
+ Left err -> pure (Left err)
+ Right filePath -> downloadFileBytes cfg filePath
+
+analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text)
+analyzeImage apiKey imageBytes userPrompt = do
+ let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes))
+ dataUrl = "data:image/jpeg;base64," <> base64Data
+ prompt = if Text.null userPrompt then "describe this image" else userPrompt
+ body =
+ Aeson.object
+ [ "model" .= ("anthropic/claude-sonnet-4" :: Text),
+ "messages"
+ .= [ Aeson.object
+ [ "role" .= ("user" :: Text),
+ "content"
+ .= [ Aeson.object
+ [ "type" .= ("text" :: Text),
+ "text" .= prompt
+ ],
+ Aeson.object
+ [ "type" .= ("image_url" :: Text),
+ "image_url"
+ .= Aeson.object
+ [ "url" .= dataUrl
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ req0 <- HTTP.parseRequest "https://openrouter.ai/api/v1/chat/completions"
+ let req =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+ <| HTTP.setRequestHeader "Content-Type" ["application/json"]
+ <| HTTP.setRequestBodyLBS (Aeson.encode body)
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+ <| req0
+ result <- try (HTTP.httpLBS req)
+ case result of
+ Left (e :: SomeException) -> pure (Left ("Vision API error: " <> tshow e))
+ Right response -> do
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then case Aeson.decode (HTTP.getResponseBody response) of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
+ Just (Aeson.Array choices) | not (null choices) ->
+ case toList choices of
+ (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
+ Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
+ Just (Aeson.String content) -> pure (Right content)
+ _ -> pure (Left "No content in message")
+ _ -> pure (Left "No message in choice")
+ _ -> pure (Left "Empty choices array")
+ _ -> pure (Left "No choices in response")
+ _ -> pure (Left "Failed to parse vision response")
+ else pure (Left ("Vision API HTTP error: " <> tshow status))
+
+transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
+transcribeVoice apiKey audioBytes = do
+ let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
+ body =
+ Aeson.object
+ [ "model" .= ("google/gemini-2.0-flash-001" :: Text),
+ "messages"
+ .= [ Aeson.object
+ [ "role" .= ("user" :: Text),
+ "content"
+ .= [ Aeson.object
+ [ "type" .= ("text" :: Text),
+ "text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text)
+ ],
+ Aeson.object
+ [ "type" .= ("input_audio" :: Text),
+ "input_audio"
+ .= Aeson.object
+ [ "data" .= base64Data,
+ "format" .= ("ogg" :: Text)
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ req0 <- HTTP.parseRequest "https://openrouter.ai/api/v1/chat/completions"
+ let req =
+ HTTP.setRequestMethod "POST"
+ <| HTTP.setRequestHeader "Authorization" ["Bearer " <> encodeUtf8 apiKey]
+ <| HTTP.setRequestHeader "Content-Type" ["application/json"]
+ <| HTTP.setRequestBodyLBS (Aeson.encode body)
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (120 * 1000000))
+ <| req0
+ result <- try (HTTP.httpLBS req)
+ case result of
+ Left (e :: SomeException) -> pure (Left ("Transcription API error: " <> tshow e))
+ Right response -> do
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then case Aeson.decode (HTTP.getResponseBody response) of
+ Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
+ Just (Aeson.Array choices) | not (null choices) ->
+ case toList choices of
+ (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
+ Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
+ Just (Aeson.String content) -> pure (Right content)
+ _ -> pure (Left "No content in message")
+ _ -> pure (Left "No message in choice")
+ _ -> pure (Left "Empty choices array")
+ _ -> pure (Left "No choices in response")
+ _ -> pure (Left "Failed to parse transcription response")
+ else pure (Left ("Transcription API HTTP error: " <> tshow status))
+
-- | Check if a document is a PDF.
isPdf :: TelegramDocument -> Bool
isPdf doc =
@@ -728,12 +998,57 @@ handleAuthorizedMessage tgConfig provider engineCfg msg uid userName chatId = do
pure (Just truncated)
_ -> pure Nothing
- let userMessage = case pdfContent of
- Just pdfText ->
+ photoAnalysis <- case tmPhoto msg of
+ Just photo -> do
+ putText <| "Processing photo: " <> tshow (tpWidth photo) <> "x" <> tshow (tpHeight photo)
+ bytesResult <- downloadPhoto tgConfig photo
+ case bytesResult of
+ Left err -> do
+ putText <| "Photo download failed: " <> err
+ pure Nothing
+ Right bytes -> do
+ putText <| "Downloaded photo, " <> tshow (BL.length bytes) <> " bytes, analyzing..."
+ analysisResult <- analyzeImage (tgOpenRouterApiKey tgConfig) bytes (tmText msg)
+ case analysisResult of
+ Left err -> do
+ putText <| "Photo analysis failed: " <> err
+ pure Nothing
+ Right analysis -> do
+ putText <| "Photo analyzed: " <> Text.take 100 analysis <> "..."
+ pure (Just analysis)
+ Nothing -> pure Nothing
+
+ voiceTranscription <- case tmVoice msg of
+ Just voice -> do
+ putText <| "Processing voice message: " <> tshow (tvDuration voice) <> " seconds"
+ bytesResult <- downloadVoice tgConfig voice
+ case bytesResult of
+ Left err -> do
+ putText <| "Voice download failed: " <> err
+ pure Nothing
+ Right bytes -> do
+ putText <| "Downloaded voice, " <> tshow (BL.length bytes) <> " bytes, transcribing..."
+ transcribeResult <- transcribeVoice (tgOpenRouterApiKey tgConfig) bytes
+ case transcribeResult of
+ Left err -> do
+ putText <| "Voice transcription failed: " <> err
+ pure Nothing
+ Right transcription -> do
+ putText <| "Transcribed: " <> Text.take 100 transcription <> "..."
+ pure (Just transcription)
+ Nothing -> pure Nothing
+
+ let userMessage = case (pdfContent, photoAnalysis, voiceTranscription) of
+ (Just pdfText, _, _) ->
let caption = tmText msg
prefix = if Text.null caption then "here's the PDF content:\n\n" else caption <> "\n\n---\nPDF content:\n\n"
in prefix <> pdfText
- Nothing -> tmText msg
+ (_, Just analysis, _) ->
+ let caption = tmText msg
+ prefix = if Text.null caption then "[user sent an image]\n\n" else caption <> "\n\n[image analysis follows]\n\n"
+ in prefix <> analysis
+ (_, _, Just transcription) -> transcription
+ _ -> tmText msg
_ <- Memory.saveMessage uid chatId Memory.UserRole (Just userName) userMessage
@@ -957,8 +1272,9 @@ startBot maybeToken = do
putText "Error: OPENROUTER_API_KEY not set"
exitFailure
Just key -> do
- let tgConfig = defaultTelegramConfig token allowedIds kagiKey
- provider = Provider.defaultOpenRouter (Text.pack key) "anthropic/claude-sonnet-4"
+ let orKey = Text.pack key
+ tgConfig = defaultTelegramConfig token allowedIds kagiKey orKey
+ provider = Provider.defaultOpenRouter orKey "anthropic/claude-sonnet-4"
putText <| "Allowed user IDs: " <> tshow allowedIds
putText <| "Kagi search: " <> if isJust kagiKey then "enabled" else "disabled"
runTelegramBot tgConfig provider