Diffstat (limited to 'Omni/Agent/Tools/WebReader.hs')
-rw-r--r--  Omni/Agent/Tools/WebReader.hs  308
1 file changed, 308 insertions(+), 0 deletions(-)
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
new file mode 100644
index 0000000..a69e3cf
--- /dev/null
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -0,0 +1,308 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Web page reader tool - fetches and summarizes web pages.
+--
+-- : out omni-agent-tools-webreader
+-- : dep aeson
+-- : dep http-conduit
+-- : run trafilatura
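+--
+-- Minimal direct-API sketch (hypothetical caller; assumes an OpenRouter
+-- API key is in scope):
+--
+-- @
+-- result <- fetchAndSummarize apiKey "https://example.com"
+-- -- result is a JSON object carrying either a summary or an error
+-- @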
+module Omni.Agent.Tools.WebReader
+ ( -- * Tool
+ webReaderTool,
+
+ -- * Direct API
+ fetchWebpage,
+ extractText,
+ fetchAndSummarize,
+
+ -- * Testing
+ main,
+ test,
+ )
+where
+
+import Alpha
+import qualified Control.Concurrent.Sema as Sema
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as Text
+import qualified Data.Text.Encoding as TE
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
+import qualified Network.HTTP.Client as HTTPClient
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Agent.Provider as Provider
+import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.IO as IO
+import qualified System.Process as Process
+import qualified System.Timeout as Timeout
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+ Test.group
+ "Omni.Agent.Tools.WebReader"
+ [ Test.unit "extractText removes HTML tags" <| do
+ let html = "<html><body><p>Hello world</p></body></html>"
+ result = extractText html
+ ("Hello world" `Text.isInfixOf` result) Test.@=? True,
+ Test.unit "extractText removes script tags" <| do
+ let html = "<html><script>alert('hi')</script><p>Content</p></html>"
+ result = extractText html
+ ("alert" `Text.isInfixOf` result) Test.@=? False
+ ("Content" `Text.isInfixOf` result) Test.@=? True,
+ Test.unit "webReaderTool has correct schema" <| do
+ let tool = webReaderTool "test-key"
+ Engine.toolName tool Test.@=? "read_webpages"
+ ]
+
+-- | Fetch timeout in microseconds (15 seconds; kept short because blocked sites won't respond anyway)
+fetchTimeoutMicros :: Int
+fetchTimeoutMicros = 15 * 1000000
+
+-- | Summarization timeout in microseconds (30 seconds)
+summarizeTimeoutMicros :: Int
+summarizeTimeoutMicros = 30 * 1000000
+
+-- | Maximum concurrent fetches
+maxConcurrentFetches :: Int
+maxConcurrentFetches = 10
+
+-- | Simple debug logging to stderr
+dbg :: Text -> IO ()
+dbg = TIO.hPutStrLn IO.stderr
+
+fetchWebpage :: Text -> IO (Either Text Text)
+fetchWebpage url = do
+ dbg ("[webreader] Fetching: " <> url)
+ result <-
+ Timeout.timeout fetchTimeoutMicros <| do
+ innerResult <-
+ try <| do
+ req0 <- HTTP.parseRequest (Text.unpack url)
+ let req =
+ HTTP.setRequestMethod "GET"
+ <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+ <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros)
+ <| req0
+ HTTP.httpLBS req
+ case innerResult of
+ Left (e :: SomeException) -> do
+ dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e)
+ pure (Left ("Failed to fetch URL: " <> tshow e))
+ Right response -> do
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then do
+ let body = HTTP.getResponseBody response
+ text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+ len = Text.length text
+ dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)")
+ pure (Right text)
+ else do
+ dbg ("[webreader] HTTP " <> tshow status <> ": " <> url)
+ pure (Left ("HTTP error: " <> tshow status))
+ case result of
+ Nothing -> do
+ dbg ("[webreader] Timeout: " <> url)
+ pure (Left ("Timeout fetching " <> url))
+ Just r -> pure r
+
+-- | Fast text extraction from HTML.
+-- Drops <script> and <style> elements (their bodies are plain text, not
+-- markup, and would otherwise leak into the output), then strips all
+-- remaining tags in a single pass.
+extractText :: Text -> Text
+extractText html = collapseWhitespace (stripAllTags (dropElement "script" (dropElement "style" html)))
+  where
+    -- Remove an element and its contents, e.g. <script>...</script>.
+    -- Matches lowercase tags, which covers the common case.
+    dropElement :: Text -> Text -> Text
+    dropElement name txt =
+      case Text.breakOn ("<" <> name) txt of
+        (before, rest)
+          | Text.null rest -> txt
+          | otherwise ->
+              case Text.breakOn ("</" <> name <> ">") rest of
+                (_, rest')
+                  | Text.null rest' -> before
+                  | otherwise -> before <> dropElement name (Text.drop (Text.length name + 3) rest')
+    -- Single pass: accumulate text outside of tags
+    stripAllTags :: Text -> Text
+    stripAllTags txt = Text.pack (go (Text.unpack txt) False [])
+      where
+        go :: [Char] -> Bool -> [Char] -> [Char]
+        go [] _ acc = reverse acc
+        go ('<' : rest) _ acc = go rest True acc -- Enter tag
+        go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space
+        go (_ : rest) True acc = go rest True acc -- Inside tag, skip
+        go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep
+    collapseWhitespace = Text.unwords <. Text.words
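+
+-- For example (illustrative): extractText "<script>x()</script><p>Hi there</p>"
+-- yields "Hi there"; the script body is dropped along with the markup.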
+
+-- | Maximum chars to send for summarization (kept small for a fast LLM response)
+maxContentForSummary :: Int
+maxContentForSummary = 15000
+
+-- | Maximum summary length to return
+maxSummaryLength :: Int
+maxSummaryLength = 1000
+
+-- | Timeout for trafilatura extraction in microseconds (10 seconds)
+extractTimeoutMicros :: Int
+extractTimeoutMicros = 10 * 1000000
+
+-- | Extract article content using trafilatura (Python library)
+-- Falls back to naive extractText if trafilatura fails
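+--
+-- Roughly equivalent to this shell pipeline (illustrative sketch only;
+-- the real call below also disables comments and tables):
+--
+-- @
+-- python3 -c "import sys, trafilatura; print(trafilatura.extract(sys.stdin.read()) or '')" < page.html
+-- @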
+extractWithTrafilatura :: Text -> IO Text
+extractWithTrafilatura html = do
+ let pythonScript =
+ "import sys; import trafilatura; "
+ <> "html = sys.stdin.read(); "
+ <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
+ <> "print(result if result else '')"
+ proc =
+ (Process.proc "python3" ["-c", Text.unpack pythonScript])
+ { Process.std_in = Process.CreatePipe,
+ Process.std_out = Process.CreatePipe,
+ Process.std_err = Process.CreatePipe
+ }
+ result <-
+ Timeout.timeout extractTimeoutMicros <| do
+ (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
+ case exitCode of
+ Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
+ Exit.ExitFailure _ -> pure ""
+ case result of
+ Just txt | not (Text.null txt) -> pure txt
+ _ -> do
+ dbg "[webreader] trafilatura failed, falling back to naive extraction"
+ pure (extractText (Text.take 100000 html))
+
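+-- | Summarize extracted page text with an LLM, truncating the input to
+-- 'maxContentForSummary' chars and the returned summary to 'maxSummaryLength'.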
+summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
+summarizeContent apiKey url content = do
+ let truncatedContent = Text.take maxContentForSummary content
+ haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
+ dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
+ dbg "[webreader] Calling LLM for summarization..."
+ startTime <- getCurrentTime
+ result <-
+ Timeout.timeout summarizeTimeoutMicros
+ <| Provider.chat
+ haiku
+ []
+ [ Provider.Message
+ Provider.System
+ ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
+ <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
+ )
+ Nothing
+ Nothing,
+ Provider.Message
+ Provider.User
+ ("Summarize: " <> url <> "\n\n" <> truncatedContent)
+ Nothing
+ Nothing
+ ]
+ endTime <- getCurrentTime
+ let elapsed = diffUTCTime endTime startTime
+ dbg ("[webreader] LLM call completed in " <> tshow elapsed)
+ case result of
+ Nothing -> do
+ dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
+ pure (Left ("Timeout summarizing " <> url))
+ Just (Left err) -> do
+ dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
+ pure (Left ("Summarization failed: " <> err))
+ Just (Right msg) -> do
+ let summary = Text.take maxSummaryLength (Provider.msgContent msg)
+ dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
+ pure (Right summary)
+
+-- | Fetch and summarize a single URL, returning a result object.
+-- This is the core helper behind both the direct API and the batch tool.
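+--
+-- The returned object takes one of these shapes (values illustrative):
+--
+-- @
+-- {"url": ..., "success": true, "summary": ...}
+-- {"url": ..., "error": ...}
+-- {"url": ..., "error": ..., "raw_content": ...}   -- summarization failed
+-- @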
+fetchAndSummarize :: Text -> Text -> IO Aeson.Value
+fetchAndSummarize apiKey url = do
+ fetchResult <- fetchWebpage url
+ case fetchResult of
+ Left err ->
+ pure (Aeson.object ["url" .= url, "error" .= err])
+ Right html -> do
+ dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
+ extractStart <- getCurrentTime
+ textContent <- extractWithTrafilatura html
+ extractEnd <- getCurrentTime
+ let extractElapsed = diffUTCTime extractEnd extractStart
+ dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
+ if Text.null (Text.strip textContent)
+ then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+ else do
+ summaryResult <- summarizeContent apiKey url textContent
+ case summaryResult of
+ Left err ->
+ pure
+ ( Aeson.object
+ [ "url" .= url,
+ "error" .= err,
+ "raw_content" .= Text.take 2000 textContent
+ ]
+ )
+ Right summary ->
+ pure
+ ( Aeson.object
+ [ "url" .= url,
+ "success" .= True,
+ "summary" .= summary
+ ]
+ )
+
+-- | Web reader tool - fetches and summarizes webpages in parallel
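+--
+-- Example arguments and result shape (values illustrative):
+--
+-- @
+-- args:   {"urls": ["https://example.com", "https://example.org"]}
+-- result: {"results": [...], "total": 2, "succeeded": 2}
+-- @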
+webReaderTool :: Text -> Engine.Tool
+webReaderTool apiKey =
+ Engine.Tool
+ { Engine.toolName = "read_webpages",
+ Engine.toolDescription =
+ "Fetch and summarize webpages in parallel. Each page is processed independently - "
+ <> "failures on one page won't affect others. Returns a list of summaries.",
+ Engine.toolJsonSchema =
+ Aeson.object
+ [ "type" .= ("object" :: Text),
+ "properties"
+ .= Aeson.object
+ [ "urls"
+ .= Aeson.object
+ [ "type" .= ("array" :: Text),
+ "items" .= Aeson.object ["type" .= ("string" :: Text)],
+ "description" .= ("List of URLs to read and summarize" :: Text)
+ ]
+ ],
+ "required" .= (["urls"] :: [Text])
+ ],
+ Engine.toolExecute = executeWebReader apiKey
+ }
+
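+-- | Decode the tool arguments, then fetch and summarize every URL through a
+-- bounded pool of 'maxConcurrentFetches' concurrent workers.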
+executeWebReader :: Text -> Aeson.Value -> IO Aeson.Value
+executeWebReader apiKey v =
+ case Aeson.fromJSON v of
+ Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
+ Aeson.Success (args :: WebReaderArgs) -> do
+ let urls = wrUrls args
+ dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
+ results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
+ let succeeded = length (filter isSuccess results)
+ dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
+ pure
+ ( Aeson.object
+ [ "results" .= results,
+ "total" .= length urls,
+ "succeeded" .= succeeded
+ ]
+ )
+ where
+ isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
+ isSuccess _ = False
+
+newtype WebReaderArgs = WebReaderArgs
+ { wrUrls :: [Text]
+ }
+ deriving (Generic)
+
+instance Aeson.FromJSON WebReaderArgs where
+ parseJSON =
+ Aeson.withObject "WebReaderArgs" <| \v ->
+ WebReaderArgs </ (v Aeson..: "urls")