summaryrefslogtreecommitdiff
path: root/Omni/Agent/Tools/Pdf.hs
blob: 768723465d385260711752b3234712b841e70941 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}

-- | PDF extraction tool using poppler-utils (pdftotext).
--
-- Extracts text from PDF files for LLM consumption.
--
-- : out omni-agent-tools-pdf
-- : dep aeson
-- : dep http-conduit
-- : dep directory
-- : dep process
module Omni.Agent.Tools.Pdf
  ( -- * Tool
    pdfTool,

    -- * Direct API
    extractPdfText,
    downloadAndExtract,

    -- * Testing
    main,
    test,
  )
where

import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as Text
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Test as Test
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
import System.Process (readProcessWithExitCode)

main :: IO ()
main = Test.run test

test :: Test.Tree
test =
  Test.group
    "Omni.Agent.Tools.Pdf"
    [ Test.unit "pdfTool has correct schema" <| do
        let tool = pdfTool
        Engine.toolName tool Test.@=? "read_pdf",
      Test.unit "extractPdfText handles missing file" <| do
        result <- extractPdfText "/nonexistent/file.pdf"
        case result of
          Left err -> ("No such file" `Text.isInfixOf` err || "pdftotext" `Text.isInfixOf` err) Test.@=? True
          Right _ -> Test.assertFailure "Expected error for missing file",
      Test.unit "chunkText splits correctly" <| do
        let text = Text.replicate 5000 "a"
            chunks = chunkText 1000 text
        length chunks Test.@=? 5
        all (\c -> Text.length c <= 1000) chunks Test.@=? True,
      Test.unit "chunkText handles small text" <| do
        let text = "small text"
            chunks = chunkText 1000 text
        chunks Test.@=? ["small text"]
    ]

data PdfArgs = PdfArgs
  { pdfPath :: Text,
    pdfMaxChars :: Maybe Int
  }
  deriving (Generic)

instance Aeson.FromJSON PdfArgs where
  parseJSON =
    Aeson.withObject "PdfArgs" <| \v ->
      (PdfArgs </ (v Aeson..: "path"))
        <*> (v Aeson..:? "max_chars")

pdfTool :: Engine.Tool
pdfTool =
  Engine.Tool
    { Engine.toolName = "read_pdf",
      Engine.toolDescription =
        "Extract text from a PDF file. Use this when you receive a PDF document "
          <> "and need to read its contents. Returns the extracted text.",
      Engine.toolJsonSchema =
        Aeson.object
          [ "type" .= ("object" :: Text),
            "properties"
              .= Aeson.object
                [ "path"
                    .= Aeson.object
                      [ "type" .= ("string" :: Text),
                        "description" .= ("Path to the PDF file" :: Text)
                      ],
                  "max_chars"
                    .= Aeson.object
                      [ "type" .= ("integer" :: Text),
                        "description" .= ("Maximum characters to return (default: 50000)" :: Text)
                      ]
                ],
            "required" .= (["path"] :: [Text])
          ],
      Engine.toolExecute = executePdf
    }

executePdf :: Aeson.Value -> IO Aeson.Value
executePdf v =
  case Aeson.fromJSON v of
    Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
    Aeson.Success (args :: PdfArgs) -> do
      let maxChars = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args)
      result <- extractPdfText (Text.unpack (pdfPath args))
      case result of
        Left err ->
          pure (Aeson.object ["error" .= err])
        Right text -> do
          let truncated = Text.take maxChars text
              wasTruncated = Text.length text > maxChars
          pure
            ( Aeson.object
                [ "success" .= True,
                  "text" .= truncated,
                  "chars" .= Text.length truncated,
                  "truncated" .= wasTruncated
                ]
            )

extractPdfText :: FilePath -> IO (Either Text Text)
extractPdfText path = do
  result <-
    try <| readProcessWithExitCode "pdftotext" ["-layout", path, "-"] ""
  case result of
    Left (e :: SomeException) ->
      pure (Left ("pdftotext error: " <> tshow e))
    Right (exitCode, stdoutStr, stderrStr) ->
      case exitCode of
        ExitSuccess -> pure (Right (Text.pack stdoutStr))
        ExitFailure code ->
          pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr))

downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text)
downloadAndExtract botToken filePath maxCharsText = do
  let url =
        "https://api.telegram.org/file/bot"
          <> Text.unpack botToken
          <> "/"
          <> Text.unpack filePath
      maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int)
  withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
    hClose tmpHandle
    downloadResult <-
      try <| do
        req <- HTTP.parseRequest url
        response <- HTTP.httpLBS req
        let status = HTTP.getResponseStatusCode response
        if status >= 200 && status < 300
          then do
            BL.writeFile tmpPath (HTTP.getResponseBody response)
            pure (Right ())
          else pure (Left ("Download failed: HTTP " <> tshow status))
    case downloadResult of
      Left (e :: SomeException) ->
        pure (Left ("Download error: " <> tshow e))
      Right (Left err) -> pure (Left err)
      Right (Right ()) -> do
        result <- extractPdfText tmpPath
        case result of
          Left err -> pure (Left err)
          Right text -> do
            let truncated = Text.take maxChars text
            pure (Right truncated)

chunkText :: Int -> Text -> [Text]
chunkText chunkSize text
  | Text.null text = []
  | Text.length text <= chunkSize = [text]
  | otherwise =
      let (chunk, rest) = Text.splitAt chunkSize text
       in chunk : chunkText chunkSize rest