From a6863d562a76eff5de36e0faa244e6ae2310bc22 Mon Sep 17 00:00:00 2001 From: Ben Sima Date: Fri, 12 Dec 2025 18:55:15 -0500 Subject: Add PDF and Notes tools to Telegram bot - Omni/Agent/Tools/Pdf.hs: Extract text from PDFs using pdftotext - Omni/Agent/Tools/Notes.hs: Quick notes CRUD with topics - Add notes table schema to Memory.hs initMemoryDb - Wire both tools into Telegram bot with logging callbacks --- Omni/Agent/Tools/Pdf.hs | 180 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 Omni/Agent/Tools/Pdf.hs (limited to 'Omni/Agent/Tools/Pdf.hs') diff --git a/Omni/Agent/Tools/Pdf.hs b/Omni/Agent/Tools/Pdf.hs new file mode 100644 index 0000000..7687234 --- /dev/null +++ b/Omni/Agent/Tools/Pdf.hs @@ -0,0 +1,180 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE NoImplicitPrelude #-} + +-- | PDF extraction tool using poppler-utils (pdftotext). +-- +-- Extracts text from PDF files for LLM consumption. +-- +-- : out omni-agent-tools-pdf +-- : dep aeson +-- : dep http-conduit +-- : dep directory +-- : dep process +module Omni.Agent.Tools.Pdf + ( -- * Tool + pdfTool, + + -- * Direct API + extractPdfText, + downloadAndExtract, + + -- * Testing + main, + test, + ) +where + +import Alpha +import Data.Aeson ((.=)) +import qualified Data.Aeson as Aeson +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text as Text +import qualified Network.HTTP.Simple as HTTP +import qualified Omni.Agent.Engine as Engine +import qualified Omni.Test as Test +import System.IO (hClose) +import System.IO.Temp (withSystemTempFile) +import System.Process (readProcessWithExitCode) + +main :: IO () +main = Test.run test + +test :: Test.Tree +test = + Test.group + "Omni.Agent.Tools.Pdf" + [ Test.unit "pdfTool has correct schema" <| do + let tool = pdfTool + Engine.toolName tool Test.@=? "read_pdf", + Test.unit "extractPdfText handles missing file" <| do + result <- extractPdfText "/nonexistent/file.pdf" + case result of + Left err -> ("No such file" `Text.isInfixOf` err || "pdftotext" `Text.isInfixOf` err) Test.@=? True + Right _ -> Test.assertFailure "Expected error for missing file", + Test.unit "chunkText splits correctly" <| do + let text = Text.replicate 5000 "a" + chunks = chunkText 1000 text + length chunks Test.@=? 5 + all (\c -> Text.length c <= 1000) chunks Test.@=? True, + Test.unit "chunkText handles small text" <| do + let text = "small text" + chunks = chunkText 1000 text + chunks Test.@=? ["small text"] + ] + +data PdfArgs = PdfArgs + { pdfPath :: Text, + pdfMaxChars :: Maybe Int + } + deriving (Generic) + +instance Aeson.FromJSON PdfArgs where + parseJSON = + Aeson.withObject "PdfArgs" <| \v -> + (PdfArgs (v Aeson..:? "max_chars") + +pdfTool :: Engine.Tool +pdfTool = + Engine.Tool + { Engine.toolName = "read_pdf", + Engine.toolDescription = + "Extract text from a PDF file. Use this when you receive a PDF document " + <> "and need to read its contents. Returns the extracted text.", + Engine.toolJsonSchema = + Aeson.object + [ "type" .= ("object" :: Text), + "properties" + .= Aeson.object + [ "path" + .= Aeson.object + [ "type" .= ("string" :: Text), + "description" .= ("Path to the PDF file" :: Text) + ], + "max_chars" + .= Aeson.object + [ "type" .= ("integer" :: Text), + "description" .= ("Maximum characters to return (default: 50000)" :: Text) + ] + ], + "required" .= (["path"] :: [Text]) + ], + Engine.toolExecute = executePdf + } + +executePdf :: Aeson.Value -> IO Aeson.Value +executePdf v = + case Aeson.fromJSON v of + Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e]) + Aeson.Success (args :: PdfArgs) -> do + let maxChars = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args) + result <- extractPdfText (Text.unpack (pdfPath args)) + case result of + Left err -> + pure (Aeson.object ["error" .= err]) + Right text -> do + let truncated = Text.take maxChars text + wasTruncated = Text.length text > maxChars + pure + ( Aeson.object + [ "success" .= True, + "text" .= truncated, + "chars" .= Text.length truncated, + "truncated" .= wasTruncated + ] + ) + +extractPdfText :: FilePath -> IO (Either Text Text) +extractPdfText path = do + result <- + try <| readProcessWithExitCode "pdftotext" ["-layout", path, "-"] "" + case result of + Left (e :: SomeException) -> + pure (Left ("pdftotext error: " <> tshow e)) + Right (exitCode, stdoutStr, stderrStr) -> + case exitCode of + ExitSuccess -> pure (Right (Text.pack stdoutStr)) + ExitFailure code -> + pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr)) + +downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text) +downloadAndExtract botToken filePath maxCharsText = do + let url = + "https://api.telegram.org/file/bot" + <> Text.unpack botToken + <> "/" + <> Text.unpack filePath + maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int) + withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do + hClose tmpHandle + downloadResult <- + try <| do + req <- HTTP.parseRequest url + response <- HTTP.httpLBS req + let status = HTTP.getResponseStatusCode response + if status >= 200 && status < 300 + then do + BL.writeFile tmpPath (HTTP.getResponseBody response) + pure (Right ()) + else pure (Left ("Download failed: HTTP " <> tshow status)) + case downloadResult of + Left (e :: SomeException) -> + pure (Left ("Download error: " <> tshow e)) + Right (Left err) -> pure (Left err) + Right (Right ()) -> do + result <- extractPdfText tmpPath + case result of + Left err -> pure (Left err) + Right text -> do + let truncated = Text.take maxChars text + pure (Right truncated) + +chunkText :: Int -> Text -> [Text] +chunkText chunkSize text + | Text.null text = [] + | Text.length text <= chunkSize = [text] + | otherwise = + let (chunk, rest) = Text.splitAt chunkSize text + in chunk : chunkText chunkSize rest -- cgit v1.2.3