From a6863d562a76eff5de36e0faa244e6ae2310bc22 Mon Sep 17 00:00:00 2001
From: Ben Sima <ben@bensima.com>
Date: Fri, 12 Dec 2025 18:55:15 -0500
Subject: Add PDF and Notes tools to Telegram bot

- Omni/Agent/Tools/Pdf.hs: Extract text from PDFs using pdftotext
- Omni/Agent/Tools/Notes.hs: Quick notes CRUD with topics
- Add notes table schema to Memory.hs initMemoryDb
- Wire both tools into Telegram bot with logging callbacks
---
 Omni/Agent/Tools/Pdf.hs | 180 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 Omni/Agent/Tools/Pdf.hs

(limited to 'Omni/Agent/Tools/Pdf.hs')

diff --git a/Omni/Agent/Tools/Pdf.hs b/Omni/Agent/Tools/Pdf.hs
new file mode 100644
index 0000000..7687234
--- /dev/null
+++ b/Omni/Agent/Tools/Pdf.hs
@@ -0,0 +1,180 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | PDF extraction tool using poppler-utils (pdftotext).
+--
+-- Extracts text from PDF files for LLM consumption.
+--
+-- : out omni-agent-tools-pdf
+-- : dep aeson
+-- : dep http-conduit
+-- : dep directory
+-- : dep process
+module Omni.Agent.Tools.Pdf
+  ( -- * Tool
+    pdfTool,
+
+    -- * Direct API
+    extractPdfText,
+    downloadAndExtract,
+
+    -- * Testing
+    main,
+    test,
+  )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as Text
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Test as Test
+import System.IO (hClose)
+import System.IO.Temp (withSystemTempFile)
+import System.Process (readProcessWithExitCode)
+
+-- | Entry point: runs this module's 'test' tree through 'Test.run'.
+main :: IO ()
+main = Test.run <| test
+
+-- | Unit tests covering the tool's name, the error path of
+-- 'extractPdfText' for a missing file, and the 'chunkText' helper.
+test :: Test.Tree
+test =
+  Test.group
+    "Omni.Agent.Tools.Pdf"
+    [ Test.unit "pdfTool has correct schema" <| do
+        Engine.toolName pdfTool Test.@=? "read_pdf",
+      Test.unit "extractPdfText handles missing file" <| do
+        outcome <- extractPdfText "/nonexistent/file.pdf"
+        case outcome of
+          Right _ -> Test.assertFailure "Expected error for missing file"
+          Left msg -> do
+            -- Either wording in the error message is accepted.
+            let mentionsCause =
+                  "No such file" `Text.isInfixOf` msg
+                    || "pdftotext" `Text.isInfixOf` msg
+            mentionsCause Test.@=? True,
+      Test.unit "chunkText splits correctly" <| do
+        -- 5000 characters cut into pieces of at most 1000.
+        let pieces = chunkText 1000 (Text.replicate 5000 "a")
+        length pieces Test.@=? 5
+        all ((<= 1000) <. Text.length) pieces Test.@=? True,
+      Test.unit "chunkText handles small text" <| do
+        -- Input shorter than the chunk size comes back as a single chunk.
+        chunkText 1000 "small text" Test.@=? ["small text"]
+    ]
+
+-- | Arguments of the @read_pdf@ tool, decoded from the tool-call JSON.
+data PdfArgs = PdfArgs
+  { -- | Path of the PDF to read (JSON key @path@, required).
+    pdfPath :: Text,
+    -- | Optional cap on the number of returned characters
+    -- (JSON key @max_chars@).
+    pdfMaxChars :: Maybe Int
+  }
+  deriving (Generic)
+
+-- | Decodes a JSON object: @path@ must be present, @max_chars@ may be
+-- omitted (yielding 'Nothing').
+instance Aeson.FromJSON PdfArgs where
+  parseJSON = Aeson.withObject "PdfArgs" decodeFields
+    where
+      decodeFields obj = do
+        path <- obj Aeson..: "path"
+        limit <- obj Aeson..:? "max_chars"
+        pure (PdfArgs path limit)
+
+-- | The @read_pdf@ tool: its name, description, the JSON schema of its
+-- arguments, and 'executePdf' as the handler.
+pdfTool :: Engine.Tool
+pdfTool =
+  Engine.Tool
+    { Engine.toolName = "read_pdf",
+      Engine.toolDescription =
+        "Extract text from a PDF file. Use this when you receive a PDF document "
+          <> "and need to read its contents. Returns the extracted text.",
+      Engine.toolJsonSchema = argsSchema,
+      Engine.toolExecute = executePdf
+    }
+  where
+    -- Object schema for the arguments: @path@ is required, @max_chars@ is not.
+    argsSchema :: Aeson.Value
+    argsSchema =
+      Aeson.object
+        [ "type" .= ("object" :: Text),
+          "properties"
+            .= Aeson.object
+              [ "path" .= property "string" "Path to the PDF file",
+                "max_chars"
+                  .= property "integer" "Maximum characters to return (default: 50000)"
+              ],
+          "required" .= (["path"] :: [Text])
+        ]
+
+    -- A single schema property: its JSON type plus a description.
+    property :: Text -> Text -> Aeson.Value
+    property jsonType summary =
+      Aeson.object
+        [ "type" .= jsonType,
+          "description" .= summary
+        ]
+
+-- | Handler for the @read_pdf@ tool.
+--
+-- Decodes the JSON arguments into 'PdfArgs', runs 'extractPdfText' on the
+-- given path, and answers with a JSON object: @{"error": ...}@ when
+-- decoding or extraction fails, otherwise the (possibly truncated) text
+-- together with the keys @success@, @chars@ and @truncated@.
+executePdf :: Aeson.Value -> IO Aeson.Value
+executePdf payload =
+  case Aeson.fromJSON payload of
+    Aeson.Success (args :: PdfArgs) -> do
+      outcome <- extractPdfText (Text.unpack (pdfPath args))
+      case outcome of
+        Right text -> pure (successReply (charLimit args) text)
+        Left err -> pure (failureReply err)
+    Aeson.Error reason -> pure (failureReply (Text.pack reason))
+  where
+    -- Requested limit clamped to the range 1000..100000; 50000 when the
+    -- caller did not supply one.
+    charLimit :: PdfArgs -> Int
+    charLimit args = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args)
+
+    failureReply :: Text -> Aeson.Value
+    failureReply message = Aeson.object ["error" .= message]
+
+    -- @chars@ counts the text actually returned; @truncated@ reports
+    -- whether the full extraction was longer than the limit.
+    successReply :: Int -> Text -> Aeson.Value
+    successReply limit text =
+      Aeson.object
+        [ "success" .= True,
+          "text" .= kept,
+          "chars" .= Text.length kept,
+          "truncated" .= (Text.length text > limit)
+        ]
+      where
+        kept = Text.take limit text
+
+-- | Extract the text of a PDF by running @pdftotext -layout <path> -@ and
+-- capturing what it writes to stdout.
+--
+-- Returns:
+--
+-- * @Right text@ when the process exits successfully;
+-- * @Left "pdftotext failed (<code>): <stderr>"@ when it exits non-zero;
+-- * @Left "pdftotext error: <exception>"@ when running the process threw.
+--
+-- A path beginning with @-@ is rewritten to @./<path>@ before being passed
+-- on, so that it is always treated as a file name and never as a
+-- command-line option. The path can originate from a tool call and must
+-- not be able to inject flags.
+extractPdfText :: FilePath -> IO (Either Text Text)
+extractPdfText path = do
+  result <-
+    try <| readProcessWithExitCode "pdftotext" ["-layout", safePath, "-"] ""
+  case result of
+    Left (e :: SomeException) ->
+      pure (Left ("pdftotext error: " <> tshow e))
+    Right (exitCode, stdoutStr, stderrStr) ->
+      case exitCode of
+        ExitSuccess -> pure (Right (Text.pack stdoutStr))
+        ExitFailure code ->
+          pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr))
+  where
+    -- "./-foo.pdf" names the same file as "-foo.pdf" but cannot be parsed
+    -- as an option.
+    safePath :: FilePath
+    safePath = case path of
+      '-' : _ -> "./" <> path
+      _ -> path
+
+-- | Download a file from the Telegram file API into a temporary file and
+-- extract its text with 'extractPdfText'.
+--
+-- Arguments, in order:
+--
+-- * @botToken@: the bot token, embedded in the download URL;
+-- * @filePath@: the file path, appended to the download URL;
+-- * @maxCharsText@: the character limit as decimal text; anything that does
+--   not parse as an 'Int' falls back to 50000. No clamping is applied here.
+--
+-- Returns @Right@ with at most that many characters of extracted text, or
+-- @Left@ with a message for a non-2xx response (@"Download failed: HTTP
+-- <status>"@), an exception during the download (@"Download error: ..."@),
+-- or an extraction failure.
+--
+-- The rendered exception can contain the request URL, and therefore the
+-- bot token, so the token is replaced with @<redacted>@ before the message
+-- is returned to the caller.
+downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text)
+downloadAndExtract botToken filePath maxCharsText = do
+  let url =
+        "https://api.telegram.org/file/bot"
+          <> Text.unpack botToken
+          <> "/"
+          <> Text.unpack filePath
+      maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int)
+  withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
+    -- Only the path is needed: the body is written by name below and
+    -- pdftotext opens the file itself.
+    hClose tmpHandle
+    downloadResult <-
+      try <| do
+        req <- HTTP.parseRequest url
+        response <- HTTP.httpLBS req
+        let status = HTTP.getResponseStatusCode response
+        if status >= 200 && status < 300
+          then do
+            BL.writeFile tmpPath (HTTP.getResponseBody response)
+            pure (Right ())
+          else pure (Left ("Download failed: HTTP " <> tshow status))
+    case downloadResult of
+      Left (e :: SomeException) ->
+        pure (Left ("Download error: " <> redactToken (tshow e)))
+      Right (Left err) -> pure (Left err)
+      Right (Right ()) -> do
+        result <- extractPdfText tmpPath
+        case result of
+          Left err -> pure (Left err)
+          Right text -> pure (Right (Text.take maxChars text))
+  where
+    -- Best-effort removal of the secret from error text. 'Text.replace'
+    -- rejects an empty needle, hence the guard.
+    redactToken :: Text -> Text
+    redactToken msg
+      | Text.null botToken = msg
+      | otherwise = Text.replace botToken "<redacted>" msg
+
+-- | Split text into consecutive chunks of at most @chunkSize@ characters.
+--
+-- * Empty input yields no chunks.
+-- * Input no longer than @chunkSize@ yields a single chunk.
+-- * A non-positive @chunkSize@ cannot make progress, so the text is
+--   returned unsplit as one chunk instead of recursing forever.
+--
+-- Concatenating the result always gives back the original text.
+chunkText :: Int -> Text -> [Text]
+chunkText chunkSize text
+  | Text.null text = []
+  | chunkSize <= 0 = [text]
+  | otherwise = Text.chunksOf chunkSize text
-- 
cgit v1.2.3