{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE NoImplicitPrelude #-} -- | PDF extraction tool using poppler-utils (pdftotext). -- -- Extracts text from PDF files for LLM consumption. -- -- : out omni-agent-tools-pdf -- : dep aeson -- : dep http-conduit -- : dep directory -- : dep process module Omni.Agent.Tools.Pdf ( -- * Tool pdfTool, -- * Direct API extractPdfText, downloadAndExtract, -- * Testing main, test, ) where import Alpha import Data.Aeson ((.=)) import qualified Data.Aeson as Aeson import qualified Data.ByteString.Lazy as BL import qualified Data.Text as Text import qualified Network.HTTP.Simple as HTTP import qualified Omni.Agent.Engine as Engine import qualified Omni.Test as Test import System.IO (hClose) import System.IO.Temp (withSystemTempFile) import System.Process (readProcessWithExitCode) main :: IO () main = Test.run test test :: Test.Tree test = Test.group "Omni.Agent.Tools.Pdf" [ Test.unit "pdfTool has correct schema" <| do let tool = pdfTool Engine.toolName tool Test.@=? "read_pdf", Test.unit "extractPdfText handles missing file" <| do result <- extractPdfText "/nonexistent/file.pdf" case result of Left err -> ("No such file" `Text.isInfixOf` err || "pdftotext" `Text.isInfixOf` err) Test.@=? True Right _ -> Test.assertFailure "Expected error for missing file", Test.unit "chunkText splits correctly" <| do let text = Text.replicate 5000 "a" chunks = chunkText 1000 text length chunks Test.@=? 5 all (\c -> Text.length c <= 1000) chunks Test.@=? True, Test.unit "chunkText handles small text" <| do let text = "small text" chunks = chunkText 1000 text chunks Test.@=? ["small text"] ] data PdfArgs = PdfArgs { pdfPath :: Text, pdfMaxChars :: Maybe Int } deriving (Generic) instance Aeson.FromJSON PdfArgs where parseJSON = Aeson.withObject "PdfArgs" <| \v -> (PdfArgs (v Aeson..:? "max_chars") pdfTool :: Engine.Tool pdfTool = Engine.Tool { Engine.toolName = "read_pdf", Engine.toolDescription = "Extract text from a PDF file. Use this when you receive a PDF document " <> "and need to read its contents. Returns the extracted text.", Engine.toolJsonSchema = Aeson.object [ "type" .= ("object" :: Text), "properties" .= Aeson.object [ "path" .= Aeson.object [ "type" .= ("string" :: Text), "description" .= ("Path to the PDF file" :: Text) ], "max_chars" .= Aeson.object [ "type" .= ("integer" :: Text), "description" .= ("Maximum characters to return (default: 50000)" :: Text) ] ], "required" .= (["path"] :: [Text]) ], Engine.toolExecute = executePdf } executePdf :: Aeson.Value -> IO Aeson.Value executePdf v = case Aeson.fromJSON v of Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e]) Aeson.Success (args :: PdfArgs) -> do let maxChars = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args) result <- extractPdfText (Text.unpack (pdfPath args)) case result of Left err -> pure (Aeson.object ["error" .= err]) Right text -> do let truncated = Text.take maxChars text wasTruncated = Text.length text > maxChars pure ( Aeson.object [ "success" .= True, "text" .= truncated, "chars" .= Text.length truncated, "truncated" .= wasTruncated ] ) extractPdfText :: FilePath -> IO (Either Text Text) extractPdfText path = do result <- try <| readProcessWithExitCode "pdftotext" ["-layout", path, "-"] "" case result of Left (e :: SomeException) -> pure (Left ("pdftotext error: " <> tshow e)) Right (exitCode, stdoutStr, stderrStr) -> case exitCode of ExitSuccess -> pure (Right (Text.pack stdoutStr)) ExitFailure code -> pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr)) downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text) downloadAndExtract botToken filePath maxCharsText = do let url = "https://api.telegram.org/file/bot" <> Text.unpack botToken <> "/" <> Text.unpack filePath maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int) withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do hClose tmpHandle downloadResult <- try <| do req <- HTTP.parseRequest url response <- HTTP.httpLBS req let status = HTTP.getResponseStatusCode response if status >= 200 && status < 300 then do BL.writeFile tmpPath (HTTP.getResponseBody response) pure (Right ()) else pure (Left ("Download failed: HTTP " <> tshow status)) case downloadResult of Left (e :: SomeException) -> pure (Left ("Download error: " <> tshow e)) Right (Left err) -> pure (Left err) Right (Right ()) -> do result <- extractPdfText tmpPath case result of Left err -> pure (Left err) Right text -> do let truncated = Text.take maxChars text pure (Right truncated) chunkText :: Int -> Text -> [Text] chunkText chunkSize text | Text.null text = [] | Text.length text <= chunkSize = [text] | otherwise = let (chunk, rest) = Text.splitAt chunkSize text in chunk : chunkText chunkSize rest