1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
|
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- | PDF extraction tool using poppler-utils (pdftotext).
--
-- Extracts text from PDF files for LLM consumption.
--
-- : out omni-agent-tools-pdf
-- : dep aeson
-- : dep http-conduit
-- : dep directory
-- : dep process
module Omni.Agent.Tools.Pdf
( -- * Tool
pdfTool,
-- * Direct API
extractPdfText,
downloadAndExtract,
-- * Testing
main,
test,
)
where
import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as Text
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Test as Test
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
import System.Process (readProcessWithExitCode)
main :: IO ()
main = Test.run test
test :: Test.Tree
test =
Test.group
"Omni.Agent.Tools.Pdf"
[ Test.unit "pdfTool has correct schema" <| do
let tool = pdfTool
Engine.toolName tool Test.@=? "read_pdf",
Test.unit "extractPdfText handles missing file" <| do
result <- extractPdfText "/nonexistent/file.pdf"
case result of
Left err -> ("No such file" `Text.isInfixOf` err || "pdftotext" `Text.isInfixOf` err) Test.@=? True
Right _ -> Test.assertFailure "Expected error for missing file",
Test.unit "chunkText splits correctly" <| do
let text = Text.replicate 5000 "a"
chunks = chunkText 1000 text
length chunks Test.@=? 5
all (\c -> Text.length c <= 1000) chunks Test.@=? True,
Test.unit "chunkText handles small text" <| do
let text = "small text"
chunks = chunkText 1000 text
chunks Test.@=? ["small text"]
]
data PdfArgs = PdfArgs
{ pdfPath :: Text,
pdfMaxChars :: Maybe Int
}
deriving (Generic)
instance Aeson.FromJSON PdfArgs where
parseJSON =
Aeson.withObject "PdfArgs" <| \v ->
(PdfArgs </ (v Aeson..: "path"))
<*> (v Aeson..:? "max_chars")
pdfTool :: Engine.Tool
pdfTool =
Engine.Tool
{ Engine.toolName = "read_pdf",
Engine.toolDescription =
"Extract text from a PDF file. Use this when you receive a PDF document "
<> "and need to read its contents. Returns the extracted text.",
Engine.toolJsonSchema =
Aeson.object
[ "type" .= ("object" :: Text),
"properties"
.= Aeson.object
[ "path"
.= Aeson.object
[ "type" .= ("string" :: Text),
"description" .= ("Path to the PDF file" :: Text)
],
"max_chars"
.= Aeson.object
[ "type" .= ("integer" :: Text),
"description" .= ("Maximum characters to return (default: 50000)" :: Text)
]
],
"required" .= (["path"] :: [Text])
],
Engine.toolExecute = executePdf
}
executePdf :: Aeson.Value -> IO Aeson.Value
executePdf v =
case Aeson.fromJSON v of
Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
Aeson.Success (args :: PdfArgs) -> do
let maxChars = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args)
result <- extractPdfText (Text.unpack (pdfPath args))
case result of
Left err ->
pure (Aeson.object ["error" .= err])
Right text -> do
let truncated = Text.take maxChars text
wasTruncated = Text.length text > maxChars
pure
( Aeson.object
[ "success" .= True,
"text" .= truncated,
"chars" .= Text.length truncated,
"truncated" .= wasTruncated
]
)
extractPdfText :: FilePath -> IO (Either Text Text)
extractPdfText path = do
result <-
try <| readProcessWithExitCode "pdftotext" ["-layout", path, "-"] ""
case result of
Left (e :: SomeException) ->
pure (Left ("pdftotext error: " <> tshow e))
Right (exitCode, stdoutStr, stderrStr) ->
case exitCode of
ExitSuccess -> pure (Right (Text.pack stdoutStr))
ExitFailure code ->
pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr))
downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text)
downloadAndExtract botToken filePath maxCharsText = do
let url =
"https://api.telegram.org/file/bot"
<> Text.unpack botToken
<> "/"
<> Text.unpack filePath
maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int)
withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
hClose tmpHandle
downloadResult <-
try <| do
req <- HTTP.parseRequest url
response <- HTTP.httpLBS req
let status = HTTP.getResponseStatusCode response
if status >= 200 && status < 300
then do
BL.writeFile tmpPath (HTTP.getResponseBody response)
pure (Right ())
else pure (Left ("Download failed: HTTP " <> tshow status))
case downloadResult of
Left (e :: SomeException) ->
pure (Left ("Download error: " <> tshow e))
Right (Left err) -> pure (Left err)
Right (Right ()) -> do
result <- extractPdfText tmpPath
case result of
Left err -> pure (Left err)
Right text -> do
let truncated = Text.take maxChars text
pure (Right truncated)
chunkText :: Int -> Text -> [Text]
chunkText chunkSize text
| Text.null text = []
| Text.length text <= chunkSize = [text]
| otherwise =
let (chunk, rest) = Text.splitAt chunkSize text
in chunk : chunkText chunkSize rest
|