summaryrefslogtreecommitdiff
path: root/Omni/Agent/Tools/Pdf.hs
diff options
context:
space:
mode:
authorBen Sima <ben@bensima.com>2025-12-12 18:55:15 -0500
committerBen Sima <ben@bensima.com>2025-12-12 18:55:15 -0500
commita6863d562a76eff5de36e0faa244e6ae2310bc22 (patch)
tree1953a43ecaacbf48355bb0c6b32d7b81e853455d /Omni/Agent/Tools/Pdf.hs
parent622786d69393c650d8d5e2b080ba9fad77f901e0 (diff)
Add PDF and Notes tools to Telegram bot
- Omni/Agent/Tools/Pdf.hs: Extract text from PDFs using pdftotext
- Omni/Agent/Tools/Notes.hs: Quick notes CRUD with topics
- Add notes table schema to Memory.hs initMemoryDb
- Wire both tools into Telegram bot with logging callbacks
Diffstat (limited to 'Omni/Agent/Tools/Pdf.hs')
-rw-r--r--Omni/Agent/Tools/Pdf.hs180
1 files changed, 180 insertions, 0 deletions
diff --git a/Omni/Agent/Tools/Pdf.hs b/Omni/Agent/Tools/Pdf.hs
new file mode 100644
index 0000000..7687234
--- /dev/null
+++ b/Omni/Agent/Tools/Pdf.hs
@@ -0,0 +1,180 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | PDF extraction tool using poppler-utils (pdftotext).
+--
+-- Extracts text from PDF files for LLM consumption.
+--
+-- : out omni-agent-tools-pdf
+-- : dep aeson
+-- : dep http-conduit
+-- : dep directory
+-- : dep process
+module Omni.Agent.Tools.Pdf
+ ( -- * Tool
+ pdfTool,
+
+ -- * Direct API
+ extractPdfText,
+ downloadAndExtract,
+
+ -- * Testing
+ main,
+ test,
+ )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as Text
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Test as Test
+import System.IO (hClose)
+import System.IO.Temp (withSystemTempFile)
+import System.Process (readProcessWithExitCode)
+
+-- | Executable entry point: hands this module's 'test' tree to 'Test.run'.
+main :: IO ()
+main = Test.run test
+
+-- | Test tree for this module.
+--
+-- Covers the tool's registered name, the error path of 'extractPdfText' for a
+-- path that does not exist, and the splitting behaviour of 'chunkText'.
+test :: Test.Tree
+test =
+  Test.group
+    "Omni.Agent.Tools.Pdf"
+    [ schemaCase,
+      missingFileCase,
+      splitCase,
+      smallTextCase
+    ]
+  where
+    schemaCase =
+      Test.unit "pdfTool has correct schema" <| do
+        Engine.toolName pdfTool Test.@=? "read_pdf"
+
+    -- Either the process-level failure ("pdftotext ...") or the underlying
+    -- cause ("No such file") is accepted as evidence of an error report.
+    missingFileCase =
+      Test.unit "extractPdfText handles missing file" <| do
+        outcome <- extractPdfText "/nonexistent/file.pdf"
+        case outcome of
+          Right _ -> Test.assertFailure "Expected error for missing file"
+          Left msg -> do
+            let mentionsCause =
+                  "No such file" `Text.isInfixOf` msg
+                    || "pdftotext" `Text.isInfixOf` msg
+            mentionsCause Test.@=? True
+
+    splitCase =
+      Test.unit "chunkText splits correctly" <| do
+        let pieces = chunkText 1000 (Text.replicate 5000 "a")
+        length pieces Test.@=? 5
+        all (\piece -> Text.length piece <= 1000) pieces Test.@=? True
+
+    smallTextCase =
+      Test.unit "chunkText handles small text" <| do
+        chunkText 1000 "small text" Test.@=? ["small text"]
+
+-- | Arguments accepted by the @read_pdf@ tool.
+--
+-- 'pdfPath' is the path of the PDF handed to 'extractPdfText'.
+-- 'pdfMaxChars' is an optional cap on the number of characters returned;
+-- 'executePdf' substitutes a default when it is absent.
+data PdfArgs = PdfArgs
+ { pdfPath :: Text,
+ pdfMaxChars :: Maybe Int
+ }
+ deriving (Generic)
+
+-- | Decodes 'PdfArgs' from a JSON object: the @path@ key is mandatory, the
+-- @max_chars@ key may be omitted.
+instance Aeson.FromJSON PdfArgs where
+  parseJSON =
+    Aeson.withObject "PdfArgs" <| \obj -> do
+      path <- obj Aeson..: "path"
+      maxChars <- obj Aeson..:? "max_chars"
+      pure (PdfArgs path maxChars)
+
+-- | Tool definition registered under the name @read_pdf@.
+--
+-- Its JSON schema declares a required string @path@ and an optional integer
+-- @max_chars@; execution is delegated to 'executePdf'.
+pdfTool :: Engine.Tool
+pdfTool =
+  Engine.Tool
+    { Engine.toolName = "read_pdf",
+      Engine.toolDescription = description,
+      Engine.toolJsonSchema = schema,
+      Engine.toolExecute = executePdf
+    }
+  where
+    description =
+      "Extract text from a PDF file. Use this when you receive a PDF document "
+        <> "and need to read its contents. Returns the extracted text."
+
+    schema =
+      Aeson.object
+        [ "type" .= ("object" :: Text),
+          "properties"
+            .= Aeson.object
+              [ "path" .= property "string" "Path to the PDF file",
+                "max_chars"
+                  .= property "integer" "Maximum characters to return (default: 50000)"
+              ],
+          "required" .= (["path"] :: [Text])
+        ]
+
+    -- One schema entry: a JSON type name plus a human-readable description.
+    property :: Text -> Text -> Aeson.Value
+    property typeName summary =
+      Aeson.object
+        [ "type" .= typeName,
+          "description" .= summary
+        ]
+
+-- | Tool entry point for @read_pdf@.
+--
+-- Decodes 'PdfArgs' from the supplied JSON, extracts the PDF's text with
+-- 'extractPdfText' and answers with a JSON object. Failures (bad arguments or
+-- extraction errors) are reported as @{"error": ...}@; success carries the
+-- possibly shortened text, its length and a flag saying whether it was cut.
+--
+-- The character limit defaults to 50000 and a requested value is clamped to
+-- the range 1000..100000.
+executePdf :: Aeson.Value -> IO Aeson.Value
+executePdf v =
+  case Aeson.fromJSON v of
+    Aeson.Error parseErr -> pure (failure (Text.pack parseErr))
+    Aeson.Success (args :: PdfArgs) -> do
+      extracted <- extractPdfText (Text.unpack (pdfPath args))
+      case extracted of
+        Left err -> pure (failure err)
+        Right body -> pure (success (charLimit args) body)
+  where
+    failure :: Text -> Aeson.Value
+    failure msg = Aeson.object ["error" .= msg]
+
+    charLimit :: PdfArgs -> Int
+    charLimit args =
+      case pdfMaxChars args of
+        Nothing -> 50000
+        Just requested -> min 100000 (max 1000 requested)
+
+    success :: Int -> Text -> Aeson.Value
+    success limit body =
+      let kept = Text.take limit body
+       in Aeson.object
+            [ "success" .= True,
+              "text" .= kept,
+              "chars" .= Text.length kept,
+              "truncated" .= (Text.length body > limit)
+            ]
+
+-- | Run @pdftotext -layout <path> -@ and return what it writes to stdout.
+--
+-- Returns 'Left' with a message when running the process raises an exception
+-- (for example when the executable cannot be started) or when it exits with a
+-- non-zero code; in the latter case the exit code and stderr are included.
+--
+-- A path that begins with @-@ is rewritten to @./<path>@ so that it is passed
+-- to pdftotext as a file name instead of being parsed as a command-line
+-- option.
+extractPdfText :: FilePath -> IO (Either Text Text)
+extractPdfText path = do
+  let safePath = case path of
+        ('-' : _) -> "./" <> path
+        _ -> path
+  result <-
+    try <| readProcessWithExitCode "pdftotext" ["-layout", safePath, "-"] ""
+  case result of
+    Left (e :: SomeException) ->
+      pure (Left ("pdftotext error: " <> tshow e))
+    Right (exitCode, stdoutStr, stderrStr) ->
+      case exitCode of
+        ExitSuccess -> pure (Right (Text.pack stdoutStr))
+        ExitFailure code ->
+          pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr))
+
+-- | Download a file from the Telegram Bot API and extract its text.
+--
+-- Arguments, in order: the bot token, the Telegram file path, and the maximum
+-- number of characters to return given as text (anything that does not parse
+-- as an 'Int' falls back to 50000).
+--
+-- The file is fetched into a temporary file, passed to 'extractPdfText', and
+-- the result is cut to the character limit. Returns 'Left' with a message on
+-- a non-2xx HTTP status, on any exception during the download, or when
+-- extraction fails.
+--
+-- The bot token is part of the download URL, and a rendered HTTP exception
+-- can include the request or URL it failed on. The token is therefore
+-- replaced with @<redacted>@ in the reported error text.
+downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text)
+downloadAndExtract botToken filePath maxCharsText = do
+  let url =
+        "https://api.telegram.org/file/bot"
+          <> Text.unpack botToken
+          <> "/"
+          <> Text.unpack filePath
+      maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int)
+      -- 'Text.replace' rejects an empty needle, so skip redaction in that case.
+      redact msg
+        | Text.null botToken = msg
+        | otherwise = Text.replace botToken "<redacted>" msg
+  withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
+    -- Only the path is needed; the body is written by name below.
+    hClose tmpHandle
+    downloadResult <-
+      try <| do
+        req <- HTTP.parseRequest url
+        response <- HTTP.httpLBS req
+        let status = HTTP.getResponseStatusCode response
+        if status >= 200 && status < 300
+          then do
+            BL.writeFile tmpPath (HTTP.getResponseBody response)
+            pure (Right ())
+          else pure (Left ("Download failed: HTTP " <> tshow status))
+    case downloadResult of
+      Left (e :: SomeException) ->
+        pure (Left ("Download error: " <> redact (tshow e)))
+      Right (Left err) -> pure (Left err)
+      Right (Right ()) -> do
+        result <- extractPdfText tmpPath
+        case result of
+          Left err -> pure (Left err)
+          Right text -> do
+            let truncated = Text.take maxChars text
+            pure (Right truncated)
+
+-- | Split text into consecutive chunks of at most @chunkSize@ characters.
+--
+-- Every chunk except possibly the last has exactly @chunkSize@ characters.
+-- Empty input yields an empty list. A non-positive @chunkSize@ also yields an
+-- empty list rather than recursing forever on zero-length chunks.
+chunkText :: Int -> Text -> [Text]
+chunkText chunkSize text = Text.chunksOf chunkSize text