summaryrefslogtreecommitdiff
path: root/Omni/Agent/Telegram/Media.hs
blob: 6539b796051b41433dffc347c8ec69d673e77552 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}

-- | Telegram Media Handling - File downloads, image analysis, voice transcription.
--
-- : out omni-agent-telegram-media
-- : dep aeson
-- : dep http-conduit
-- : dep base64-bytestring
module Omni.Agent.Telegram.Media
  ( -- * File Downloads
    getFile,
    downloadFile,
    downloadFileBytes,
    downloadPhoto,
    downloadVoice,
    downloadAndExtractPdf,

    -- * Multimodal Processing
    analyzeImage,
    transcribeVoice,

    -- * Size Limits
    maxImageBytes,
    maxVoiceBytes,
    checkPhotoSize,
    checkVoiceSize,

    -- * HTTP Utilities
    httpGetBytes,
    httpPostJson,

    -- * Testing
    main,
    test,
  )
where

import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.KeyMap as KeyMap
import qualified Data.ByteString as BS
import qualified Data.ByteString.Base64.Lazy as B64
import qualified Data.ByteString.Lazy as BL
import qualified Data.CaseInsensitive as CI
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import qualified Network.HTTP.Client as HTTPClient
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Telegram.Types as Types
import qualified Omni.Agent.Tools.Pdf as Pdf
import qualified Omni.Test as Test
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)

main :: IO ()
main = Test.run test

test :: Test.Tree
test =
  Test.group
    "Omni.Agent.Telegram.Media"
    [ Test.unit "maxImageBytes is 10MB" <| do
        maxImageBytes Test.@=? 10_000_000,
      Test.unit "maxVoiceBytes is 20MB" <| do
        maxVoiceBytes Test.@=? 20_000_000,
      Test.unit "checkPhotoSize accepts small photos" <| do
        let photo = Types.TelegramPhoto "id" 800 600 (Just 100_000)
        checkPhotoSize photo Test.@=? Right (),
      Test.unit "checkPhotoSize rejects large photos" <| do
        let photo = Types.TelegramPhoto "id" 800 600 (Just 15_000_000)
        case checkPhotoSize photo of
          Left _ -> pure ()
          Right _ -> Test.assertFailure "Expected rejection",
      Test.unit "checkVoiceSize accepts small voice" <| do
        let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 500_000)
        checkVoiceSize voice Test.@=? Right (),
      Test.unit "checkVoiceSize rejects large voice" <| do
        let voice = Types.TelegramVoice "id" 60 (Just "audio/ogg") (Just 25_000_000)
        case checkVoiceSize voice of
          Left _ -> pure ()
          Right _ -> Test.assertFailure "Expected rejection"
    ]

maxImageBytes :: Int
maxImageBytes = 10_000_000

maxVoiceBytes :: Int
maxVoiceBytes = 20_000_000

checkPhotoSize :: Types.TelegramPhoto -> Either Text ()
checkPhotoSize photo =
  case Types.tpFileSize photo of
    Just size
      | size > maxImageBytes ->
          Left <| "image too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxImageBytes `div` 1_000_000) <> "MB"
    _ -> Right ()

checkVoiceSize :: Types.TelegramVoice -> Either Text ()
checkVoiceSize voice =
  case Types.tvFileSize voice of
    Just size
      | size > maxVoiceBytes ->
          Left <| "voice message too large (" <> tshow (size `div` 1_000_000) <> "MB), max " <> tshow (maxVoiceBytes `div` 1_000_000) <> "MB"
    _ -> Right ()

httpGetBytes :: String -> IO (Either Text BL.ByteString)
httpGetBytes url = do
  result <-
    try <| do
      req <- HTTP.parseRequest url
      resp <- HTTP.httpLBS req
      let status = HTTP.getResponseStatusCode resp
      if status >= 200 && status < 300
        then pure (Right (HTTP.getResponseBody resp))
        else pure (Left ("HTTP " <> tshow status))
  case result of
    Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
    Right r -> pure r

httpPostJson :: String -> [(ByteString, ByteString)] -> Aeson.Value -> Int -> IO (Either Text BL.ByteString)
httpPostJson url extraHeaders body timeoutSecs = do
  result <-
    try <| do
      req0 <- HTTP.parseRequest url
      let baseReq =
            HTTP.setRequestMethod "POST"
              <| HTTP.setRequestHeader "Content-Type" ["application/json"]
              <| HTTP.setRequestBodyLBS (Aeson.encode body)
              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (timeoutSecs * 1000000))
              <| req0
          req = foldr addHeader baseReq extraHeaders
          addHeader (name, value) = HTTP.addRequestHeader (CI.mk name) value
      resp <- HTTP.httpLBS req
      let status = HTTP.getResponseStatusCode resp
      if status >= 200 && status < 300
        then pure (Right (HTTP.getResponseBody resp))
        else pure (Left ("HTTP " <> tshow status <> ": " <> shortBody resp))
  case result of
    Left (e :: SomeException) -> pure (Left ("HTTP error: " <> tshow e))
    Right r -> pure r
  where
    shortBody r =
      let b = BL.toStrict (HTTP.getResponseBody r)
       in TE.decodeUtf8 (BS.take 200 b)

getFile :: Types.TelegramConfig -> Text -> IO (Either Text Text)
getFile cfg fileId = do
  let url =
        Text.unpack (Types.tgApiBaseUrl cfg)
          <> "/bot"
          <> Text.unpack (Types.tgBotToken cfg)
          <> "/getFile?file_id="
          <> Text.unpack fileId
  result <- httpGetBytes url
  case result of
    Left err -> pure (Left err)
    Right body ->
      case Aeson.decode body of
        Just (Aeson.Object obj) -> case KeyMap.lookup "result" obj of
          Just (Aeson.Object resultObj) -> case KeyMap.lookup "file_path" resultObj of
            Just (Aeson.String path) -> pure (Right path)
            _ -> pure (Left "No file_path in response")
          _ -> pure (Left "No result in response")
        _ -> pure (Left "Failed to parse getFile response")

downloadFileBytes :: Types.TelegramConfig -> Text -> IO (Either Text BL.ByteString)
downloadFileBytes cfg filePath = do
  let url =
        "https://api.telegram.org/file/bot"
          <> Text.unpack (Types.tgBotToken cfg)
          <> "/"
          <> Text.unpack filePath
  httpGetBytes url

downloadFile :: Types.TelegramConfig -> Text -> FilePath -> IO (Either Text ())
downloadFile cfg filePath destPath = do
  result <- downloadFileBytes cfg filePath
  case result of
    Left err -> pure (Left err)
    Right bytes -> do
      BL.writeFile destPath bytes
      pure (Right ())

downloadPhoto :: Types.TelegramConfig -> Types.TelegramPhoto -> IO (Either Text BL.ByteString)
downloadPhoto cfg photo = do
  filePathResult <- getFile cfg (Types.tpFileId photo)
  case filePathResult of
    Left err -> pure (Left err)
    Right filePath -> downloadFileBytes cfg filePath

downloadVoice :: Types.TelegramConfig -> Types.TelegramVoice -> IO (Either Text BL.ByteString)
downloadVoice cfg voice = do
  filePathResult <- getFile cfg (Types.tvFileId voice)
  case filePathResult of
    Left err -> pure (Left err)
    Right filePath -> downloadFileBytes cfg filePath

downloadAndExtractPdf :: Types.TelegramConfig -> Text -> IO (Either Text Text)
downloadAndExtractPdf cfg fileId = do
  filePathResult <- getFile cfg fileId
  case filePathResult of
    Left err -> pure (Left err)
    Right filePath ->
      withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
        hClose tmpHandle
        downloadResult <- downloadFile cfg filePath tmpPath
        case downloadResult of
          Left err -> pure (Left err)
          Right () -> Pdf.extractPdfText tmpPath

parseOpenRouterResponse :: BL.ByteString -> Either Text Text
parseOpenRouterResponse body =
  case Aeson.decode body of
    Just (Aeson.Object obj) -> case KeyMap.lookup "choices" obj of
      Just (Aeson.Array choices) | not (null choices) ->
        case toList choices of
          (Aeson.Object choice : _) -> case KeyMap.lookup "message" choice of
            Just (Aeson.Object msg) -> case KeyMap.lookup "content" msg of
              Just (Aeson.String content) -> Right content
              Just Aeson.Null -> Left "No content in response"
              _ -> Left "Unexpected content type in response"
            _ -> Left "No message in choice"
          _ -> Left "Empty choices array"
      _ -> Left "No choices in response"
    _ -> Left "Failed to parse response"

analyzeImage :: Text -> BL.ByteString -> Text -> IO (Either Text Text)
analyzeImage apiKey imageBytes userPrompt = do
  let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode imageBytes))
      dataUrl = "data:image/jpeg;base64," <> base64Data
      prompt =
        if Text.null userPrompt
          then "describe this image objectively in third person. do not use first person pronouns like 'I can see'. just describe what is shown."
          else userPrompt <> "\n\n(describe objectively in third person, no first person pronouns)"
      body =
        Aeson.object
          [ "model" .= ("anthropic/claude-sonnet-4.5" :: Text),
            "messages"
              .= [ Aeson.object
                     [ "role" .= ("user" :: Text),
                       "content"
                         .= [ Aeson.object
                                [ "type" .= ("text" :: Text),
                                  "text" .= prompt
                                ],
                              Aeson.object
                                [ "type" .= ("image_url" :: Text),
                                  "image_url"
                                    .= Aeson.object
                                      [ "url" .= dataUrl
                                      ]
                                ]
                            ]
                     ]
                 ]
          ]
      headers =
        [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
          ("HTTP-Referer", "https://omni.dev"),
          ("X-Title", "Omni Agent")
        ]
  result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
  case result of
    Left err -> pure (Left ("Vision API error: " <> err))
    Right respBody -> pure (first ("Vision API: " <>) (parseOpenRouterResponse respBody))

transcribeVoice :: Text -> BL.ByteString -> IO (Either Text Text)
transcribeVoice apiKey audioBytes = do
  let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
      body =
        Aeson.object
          [ "model" .= ("google/gemini-2.5-flash" :: Text),
            "messages"
              .= [ Aeson.object
                     [ "role" .= ("user" :: Text),
                       "content"
                         .= [ Aeson.object
                                [ "type" .= ("input_audio" :: Text),
                                  "input_audio"
                                    .= Aeson.object
                                      [ "data" .= base64Data,
                                        "format" .= ("ogg" :: Text)
                                      ]
                                ],
                              Aeson.object
                                [ "type" .= ("text" :: Text),
                                  "text" .= ("transcribe this audio exactly. return ONLY the transcription, no commentary or preamble." :: Text)
                                ]
                            ]
                     ]
                 ]
          ]
      headers =
        [ ("Authorization", "Bearer " <> encodeUtf8 apiKey),
          ("HTTP-Referer", "https://omni.dev"),
          ("X-Title", "Omni Agent")
        ]
  result <- httpPostJson "https://openrouter.ai/api/v1/chat/completions" headers body 120
  case result of
    Left err -> pure (Left ("Transcription API error: " <> err))
    Right respBody -> pure (first ("Transcription API: " <>) (parseOpenRouterResponse respBody))