1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
|
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- | Telegram Media Handling - File downloads, image analysis, voice transcription.
--
-- : out omni-agent-telegram-media
-- : dep aeson
-- : dep http-conduit
-- : dep base64-bytestring
module Omni.Agent.Telegram.Media
( -- * File Downloads
getFile,
downloadFile,
downloadFileBytes,
downloadPhoto,
downloadVoice,
downloadAndExtractPdf,
-- * Multimodal Processing
analyzeImage,
transcribeVoice,
-- * Size Limits
maxImageBytes,
maxVoiceBytes,
checkPhotoSize,
checkVoiceSize,
-- * HTTP Utilities
httpGetBytes,
httpPostJson,
-- * Testing
main,
test,
)
where
import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.KeyMap as KeyMap
import qualified Data.ByteString as BS
import qualified Data.ByteString.Base64.Lazy as B64
import qualified Data.ByteString.Lazy as BL
import qualified Data.CaseInsensitive as CI
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import qualified Network.HTTP.Client as HTTPClient
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Telegram.Types as Types
import qualified Omni.Agent.Tools.Pdf as Pdf
import qualified Omni.Test as Test
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
main :: IO ()
main = Test.run test
test :: Test.Tree
test =
Test.group
"Omni.Agent.Telegram.Media"
[ Test.unit "maxImageBytes is 10MB" <| do
maxImageBytes Test.@=? 10_000_000,
Test.unit "maxVoiceBytes is 20MB" <| do
maxVoiceBytes Test.@=? 20_000_000,
Test.unit "checkPhotoSize accepts small photos" <| do
let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000)
checkPhotoSize photo Test.@=? Right (),
Test.unit "checkPhotoSize rejects large photos" <| do
let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000)
case checkPhotoSize photo of
Left _ -> pure ()
Right _ -> Test.assertFailure "Expected rejection",
Test.unit "checkVoiceSize accepts small voice" <| do
let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000)
checkVoiceSize voice Test.@=? Right (),
Test.unit "checkVoiceSize rejects large voice" <| do
let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000)
case checkVoiceSize voice of
Left _ -> pure ()
Right _ -> Test.assertFailure "Expected rejection"
]
maxImageBytes :: Int
maxImageBytes = 10_000_000
maxVoiceBytes :: Int
maxVoiceBytes = 20_000_000
checkPhotoSize :: Types.TelegramPhoto -> Either Text ()
checkPhotoSize photo =
case Types.tpFileSize photo of
Just size
| size > maxImageBytes ->
Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB"
_ -> Right ()
checkVoiceSize :: Types.TelegramVoice -> Either Text ()
checkVoiceSize voice =
case Types.tvFileSize voice of
Just size
| size > maxVoiceBytes ->
Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB"
_ -> Right ()
httpGetBytes :: String -> IO (Either Text BL.ByteString)
httpGetBytes url = do
result <-
try <| do
req <- HTTP.parseRequest url
resp <- HTTP.httpLBS req
let status = HTTP.getResponseStatusCode resp
if status >= 200 && status < 300
then pure (Right (HTTP.getResponseBody resp))
else pure (Left ("HTTP " <> tshow status))
case result of
Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
Right r -> pure r
httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString)
httpPostJson url extraHeaders body timeoutSecs = do
result <-
try <| do
req0 <- HTTP.parseRequest url
let baseReq =
HTTP.setRequestMethod "POST"
<| HTTP.setRequestHeader "Content-Type" ["application/json"]
<| HTTP.setRequestBodyLBS (Aeson.encode body)
<| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000))
<| req0
req = foldr addHeader baseReq extraHeaders
addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value
resp <- HTTP.httpLBS req
let status = HTTP.getResponseStatusCode resp
if status >= 200 && status < 300
then pure (Right (HTTP.getResponseBody resp))
else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp))
case result of
Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
Right r -> pure r
where
shortBody r =
let b = BL.toStrict (HTTP.getResponseBody r)
in TE.decodeUtf8 (BS.take 200 b)
getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text)
getFile cfg fileId = do
let url =
Text.unpack (Types.tgApiBaseUrl cfg)
<> "/bot"
<> Text.unpack (Types.tgBotToken cfg)
<> "/getFile?file_id="
<> Text.unpack fileId
result <- httpGetBytes url
case result of
Left err -> pure (Left err)
Right body ->
case Aeson.decode body of
Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of
Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of
Just (Aeson.String path) -> pure (Right path)
_ -> pure (Left "No file_path in response")
_ -> pure (Left "No result in response")
_ -> pure (Left "Failed to parse getFile response")
downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString)
downloadFileBytes cfg filePath = do
let url =
"https://api.telegram.org/file/bot"
<> Text.unpack (Types.tgBotToken cfg)
<> "/"
<> Text.unpack filePath
httpGetBytes url
downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ())
downloadFile cfg filePath destPath = do
result <- downloadFileBytes cfg filePath
case result of
Left err -> pure (Left err)
Right bytes -> do
BL.writeFile destPath bytes
pure (Right ())
downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString)
downloadPhoto cfg photo = do
filePathResult <- getFile cfg (Types.tpFileId photo)
case filePathResult of
Left err -> pure (Left err)
Right filePath -> downloadFileBytes cfg filePath
downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString)
downloadVoice cfg voice = do
filePathResult <- getFile cfg (Types.tvFileId voice)
case filePathResult of
Left err -> pure (Left err)
Right filePath -> downloadFileBytes cfg filePath
downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text)
downloadAndExtractPdf cfg fileId = do
filePathResult <- getFile cfg fileId
case filePathResult of
Left err -> pure (Left err)
Right filePath ->
withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
hClose tmpHandle
downloadResult <- downloadFile cfg filePath tmpPath
case downloadResult of
Left err -> pure (Left err)
Right () -> Pdf.extractPdfText tmpPath
parseOpenRouterResponse :: BL.ByteString -> Either Text Text
parseOpenRouterResponse body =
case Aeson.decode body of
Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
Just (Aeson.Array choices) | not (null choices) ->
case toList choices of
(Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
Just (Aeson.String content) -> Right content
Just Aeson.Null -> Left "No content in response"
_ -> Left "Unexpected content type in response"
_ -> Left "No message in choice"
_ -> Left "Empty choices array"
_ -> Left "No choices in response"
_ -> Left "Failed to parse response"
analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text)
analyzeImage apiKey imageBytes userPrompt = do
let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes))
dataUrl = "data:image/jpeg;base64," <> base64Data
prompt =
if Text.null userPrompt
then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown."
else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)"
body =
Aeson.object
[ "model" .= ("anthropic/claude-sonnet-4.5" :: Text),
"messages"
.= [ Aeson.object
[ "role" .= ("user" :: Text),
"content"
.= [ Aeson.object
[ "type" .= ("text" :: Text),
"text" .= prompt
],
Aeson.object
[ "type" .= ("image_url" :: Text),
"image_url"
.= Aeson.object
[ "url" .= dataUrl
]
]
]
]
]
]
headers =
[ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
("HTTP-Referer", "https://omni.dev"),
("X-Title", "Omni Agent")
]
result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
case result of
Left err -> pure (Left ("Vision API error: " <> err))
Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))
transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
transcribeVoice apiKey audioBytes = do
let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
body =
Aeson.object
[ "model" .= ("google/gemini-2.0-flash-001" :: Text),
"messages"
.= [ Aeson.object
[ "role" .= ("user" :: Text),
"content"
.= [ Aeson.object
[ "type" .= ("text" :: Text),
"text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text)
],
Aeson.object
[ "type" .= ("input_audio" :: Text),
"input_audio"
.= Aeson.object
[ "data" .= base64Data,
"format" .= ("ogg" :: Text)
]
]
]
]
]
]
headers =
[ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
("HTTP-Referer", "https://omni.dev"),
("X-Title", "Omni Agent")
]
result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
case result of
Left err -> pure (Left ("Transcription API error: " <> err))
Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))
|