{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}

-- | PDF extraction tool using poppler-utils (pdftotext).
--
-- Extracts text from PDF files for LLM consumption.
--
-- : out omni-agent-tools-pdf
-- : dep aeson
-- : dep http-conduit
-- : dep directory
-- : dep process
module Omni.Agent.Tools.Pdf
  ( -- * Tool
    pdfTool,

    -- * Direct API
    extractPdfText,
    downloadAndExtract,

    -- * Testing
    main,
    test,
  )
where

import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as Text
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Test as Test
import System.IO (hClose)
import System.IO.Temp (withSystemTempFile)
import System.Process (readProcessWithExitCode)

main :: IO ()
main = Test.run test

test :: Test.Tree
test =
  Test.group
    "Omni.Agent.Tools.Pdf"
    [ Test.unit "pdfTool has correct schema" <| do
        let tool = pdfTool
        Engine.toolName tool Test.@=? "read_pdf",
      Test.unit "extractPdfText handles missing file" <| do
        result <- extractPdfText "/nonexistent/file.pdf"
        case result of
          Left err -> ("No such file" `Text.isInfixOf` err || "pdftotext" `Text.isInfixOf` err) Test.@=? True
          Right _ -> Test.assertFailure "Expected error for missing file",
      Test.unit "chunkText splits correctly" <| do
        let text = Text.replicate 5000 "a"
            chunks = chunkText 1000 text
        length chunks Test.@=? 5
        all (\c -> Text.length c <= 1000) chunks Test.@=? True,
      Test.unit "chunkText handles small text" <| do
        let text = "small text"
            chunks = chunkText 1000 text
        chunks Test.@=? ["small text"]
    ]

data PdfArgs = PdfArgs
  { pdfPath :: Text,
    pdfMaxChars :: Maybe Int
  }
  deriving (Generic)

instance Aeson.FromJSON PdfArgs where
  parseJSON =
    Aeson.withObject "PdfArgs" <| \v ->
      (PdfArgs </ (v Aeson..: "path"))
        <*> (v Aeson..:? "max_chars")

pdfTool :: Engine.Tool
pdfTool =
  Engine.Tool
    { Engine.toolName = "read_pdf",
      Engine.toolDescription =
        "Extract text from a PDF file. Use this when you receive a PDF document "
          <> "and need to read its contents. Returns the extracted text.",
      Engine.toolJsonSchema =
        Aeson.object
          [ "type" .= ("object" :: Text),
            "properties"
              .= Aeson.object
                [ "path"
                    .= Aeson.object
                      [ "type" .= ("string" :: Text),
                        "description" .= ("Path to the PDF file" :: Text)
                      ],
                  "max_chars"
                    .= Aeson.object
                      [ "type" .= ("integer" :: Text),
                        "description" .= ("Maximum characters to return (default: 50000)" :: Text)
                      ]
                ],
            "required" .= (["path"] :: [Text])
          ],
      Engine.toolExecute = executePdf
    }

executePdf :: Aeson.Value -> IO Aeson.Value
executePdf v =
  case Aeson.fromJSON v of
    Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
    Aeson.Success (args :: PdfArgs) -> do
      let maxChars = maybe 50000 (min 100000 <. max 1000) (pdfMaxChars args)
      result <- extractPdfText (Text.unpack (pdfPath args))
      case result of
        Left err ->
          pure (Aeson.object ["error" .= err])
        Right text -> do
          let truncated = Text.take maxChars text
              wasTruncated = Text.length text > maxChars
          pure
            ( Aeson.object
                [ "success" .= True,
                  "text" .= truncated,
                  "chars" .= Text.length truncated,
                  "truncated" .= wasTruncated
                ]
            )

extractPdfText :: FilePath -> IO (Either Text Text)
extractPdfText path = do
  result <-
    try <| readProcessWithExitCode "pdftotext" ["-layout", path, "-"] ""
  case result of
    Left (e :: SomeException) ->
      pure (Left ("pdftotext error: " <> tshow e))
    Right (exitCode, stdoutStr, stderrStr) ->
      case exitCode of
        ExitSuccess -> pure (Right (Text.pack stdoutStr))
        ExitFailure code ->
          pure (Left ("pdftotext failed (" <> tshow code <> "): " <> Text.pack stderrStr))

downloadAndExtract :: Text -> Text -> Text -> IO (Either Text Text)
downloadAndExtract botToken filePath maxCharsText = do
  let url =
        "https://api.telegram.org/file/bot"
          <> Text.unpack botToken
          <> "/"
          <> Text.unpack filePath
      maxChars = maybe 50000 identity (readMaybe (Text.unpack maxCharsText) :: Maybe Int)
  withSystemTempFile "telegram_pdf.pdf" <| \tmpPath tmpHandle -> do
    hClose tmpHandle
    downloadResult <-
      try <| do
        req <- HTTP.parseRequest url
        response <- HTTP.httpLBS req
        let status = HTTP.getResponseStatusCode response
        if status >= 200 && status < 300
          then do
            BL.writeFile tmpPath (HTTP.getResponseBody response)
            pure (Right ())
          else pure (Left ("Download failed: HTTP " <> tshow status))
    case downloadResult of
      Left (e :: SomeException) ->
        pure (Left ("Download error: " <> tshow e))
      Right (Left err) -> pure (Left err)
      Right (Right ()) -> do
        result <- extractPdfText tmpPath
        case result of
          Left err -> pure (Left err)
          Right text -> do
            let truncated = Text.take maxChars text
            pure (Right truncated)

chunkText :: Int -> Text -> [Text]
chunkText chunkSize text
  | Text.null text = []
  | Text.length text <= chunkSize = [text]
  | otherwise =
      let (chunk, rest) = Text.splitAt chunkSize text
       in chunk : chunkText chunkSize rest