path: root/Omni/Agent/Tools/WebReader.hs
author    Ben Sima <ben@bensima.com>    2025-12-16 13:24:54 -0500
committer Ben Sima <ben@bensima.com>    2025-12-16 13:24:54 -0500
commit    b18bd4eee969681ee532c4898ddaaa0851e6b846 (patch)
tree      0a966754459c5873b9dad4289ea51e901bd4399b /Omni/Agent/Tools/WebReader.hs
parent    122d73ac9d2472f91ed00965d03d1e761da72699 (diff)
Batch web_reader tool, much faster
Added retry with backoff, parallel processing, trimming pages down to their main content, and summarization with Haiku. It's so much faster and more reliable now. Also improved the logging system and disentangled the status UI bar from the logging module.
Diffstat (limited to 'Omni/Agent/Tools/WebReader.hs')
-rw-r--r--    Omni/Agent/Tools/WebReader.hs    318
1 file changed, 208 insertions(+), 110 deletions(-)
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
index 9b776ad..a69e3cf 100644
--- a/Omni/Agent/Tools/WebReader.hs
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -8,6 +8,7 @@
-- : out omni-agent-tools-webreader
-- : dep aeson
-- : dep http-conduit
+-- : run trafilatura
module Omni.Agent.Tools.WebReader
( -- * Tool
webReaderTool,
@@ -15,6 +16,7 @@ module Omni.Agent.Tools.WebReader
-- * Direct API
fetchWebpage,
extractText,
+ fetchAndSummarize,
-- * Testing
main,
@@ -23,16 +25,24 @@ module Omni.Agent.Tools.WebReader
where
import Alpha
+import qualified Control.Concurrent.Sema as Sema
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
import qualified Network.HTTP.Client as HTTPClient
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Agent.Provider as Provider
import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.IO as IO
+import qualified System.Process as Process
+import qualified System.Timeout as Timeout
main :: IO ()
main = Test.run test
@@ -52,117 +62,216 @@ test =
("Content" `Text.isInfixOf` result) Test.@=? True,
Test.unit "webReaderTool has correct schema" <| do
let tool = webReaderTool "test-key"
- Engine.toolName tool Test.@=? "read_webpage"
+ Engine.toolName tool Test.@=? "read_webpages"
]
+-- | Fetch timeout in microseconds (15 seconds - short because blocked sites won't respond anyway)
+fetchTimeoutMicros :: Int
+fetchTimeoutMicros = 15 * 1000000
+
+-- | Summarization timeout in microseconds (30 seconds)
+summarizeTimeoutMicros :: Int
+summarizeTimeoutMicros = 30 * 1000000
+
+-- | Maximum concurrent fetches
+maxConcurrentFetches :: Int
+maxConcurrentFetches = 10
+
+-- | Simple debug logging to stderr
+dbg :: Text -> IO ()
+dbg = TIO.hPutStrLn IO.stderr
+
fetchWebpage :: Text -> IO (Either Text Text)
fetchWebpage url = do
+ dbg ("[webreader] Fetching: " <> url)
result <-
- try <| do
- req0 <- HTTP.parseRequest (Text.unpack url)
- let req =
- HTTP.setRequestMethod "GET"
- <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
- <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
- <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
- <| req0
- HTTP.httpLBS req
+ Timeout.timeout fetchTimeoutMicros <| do
+ innerResult <-
+ try <| do
+ req0 <- HTTP.parseRequest (Text.unpack url)
+ let req =
+ HTTP.setRequestMethod "GET"
+ <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+ <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros)
+ <| req0
+ HTTP.httpLBS req
+ case innerResult of
+ Left (e :: SomeException) -> do
+ dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e)
+ pure (Left ("Failed to fetch URL: " <> tshow e))
+ Right response -> do
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then do
+ let body = HTTP.getResponseBody response
+ text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+ len = Text.length text
+ dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)")
+ pure (Right text)
+ else do
+ dbg ("[webreader] HTTP " <> tshow status <> ": " <> url)
+ pure (Left ("HTTP error: " <> tshow status))
case result of
- Left (e :: SomeException) ->
- pure (Left ("Failed to fetch URL: " <> tshow e))
- Right response -> do
- let status = HTTP.getResponseStatusCode response
- if status >= 200 && status < 300
- then do
- let body = HTTP.getResponseBody response
- text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
- pure (Right text)
- else pure (Left ("HTTP error: " <> tshow status))
+ Nothing -> do
+ dbg ("[webreader] Timeout: " <> url)
+ pure (Left ("Timeout fetching " <> url))
+ Just r -> pure r
+-- | Fast single-pass text extraction from HTML
+-- Strips all tags in one pass, no expensive operations
extractText :: Text -> Text
-extractText html =
- let noScript = removeTagContent "script" html
- noStyle = removeTagContent "style" noScript
- noNoscript = removeTagContent "noscript" noStyle
- noTags = stripTags noNoscript
- in collapseWhitespace noTags
+extractText html = collapseWhitespace (stripAllTags html)
where
- removeTagContent :: Text -> Text -> Text
- removeTagContent tag txt =
- let openTag = "<" <> tag
- closeTag = "</" <> tag <> ">"
- in removeMatches openTag closeTag txt
-
- removeMatches :: Text -> Text -> Text -> Text
- removeMatches open close txt =
- case Text.breakOn open (Text.toLower txt) of
- (_, "") -> txt
- (before, _) ->
- let actualBefore = Text.take (Text.length before) txt
- rest = Text.drop (Text.length before) txt
- in case Text.breakOn close (Text.toLower rest) of
- (_, "") -> actualBefore
- (_, afterClose) ->
- let skipLen = Text.length close
- remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
- in actualBefore <> removeMatches open close remaining
-
- stripTags :: Text -> Text
- stripTags txt = go txt ""
+ -- Single pass: accumulate text outside of tags
+ stripAllTags :: Text -> Text
+ stripAllTags txt = Text.pack (go (Text.unpack txt) False [])
where
- go :: Text -> Text -> Text
- go remaining acc =
- case Text.breakOn "<" remaining of
- (before, "") -> acc <> before
- (before, rest) ->
- case Text.breakOn ">" rest of
- (_, "") -> acc <> before
- (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
-
+ go :: [Char] -> Bool -> [Char] -> [Char]
+ go [] _ acc = reverse acc
+ go ('<' : rest) _ acc = go rest True acc -- Enter tag
+ go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space
+ go (_ : rest) True acc = go rest True acc -- Inside tag, skip
+ go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep
collapseWhitespace = Text.unwords <. Text.words
+-- | Maximum chars to send for summarization (keep it small for fast LLM response)
+maxContentForSummary :: Int
+maxContentForSummary = 15000
+
+-- | Maximum summary length to return
+maxSummaryLength :: Int
+maxSummaryLength = 1000
+
+-- | Timeout for trafilatura extraction in microseconds (10 seconds)
+extractTimeoutMicros :: Int
+extractTimeoutMicros = 10 * 1000000
+
+-- | Extract article content using trafilatura (Python library)
+-- Falls back to naive extractText if trafilatura fails
+extractWithTrafilatura :: Text -> IO Text
+extractWithTrafilatura html = do
+ let pythonScript =
+ "import sys; import trafilatura; "
+ <> "html = sys.stdin.read(); "
+ <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
+ <> "print(result if result else '')"
+ proc =
+ (Process.proc "python3" ["-c", Text.unpack pythonScript])
+ { Process.std_in = Process.CreatePipe,
+ Process.std_out = Process.CreatePipe,
+ Process.std_err = Process.CreatePipe
+ }
+ result <-
+ Timeout.timeout extractTimeoutMicros <| do
+ (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
+ case exitCode of
+ Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
+ Exit.ExitFailure _ -> pure ""
+ case result of
+ Just txt | not (Text.null txt) -> pure txt
+ _ -> do
+ dbg "[webreader] trafilatura failed, falling back to naive extraction"
+ pure (extractText (Text.take 100000 html))
+
summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
summarizeContent apiKey url content = do
- let truncatedContent = Text.take 50000 content
- gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+ let truncatedContent = Text.take maxContentForSummary content
+ haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
+ dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
+ dbg "[webreader] Calling LLM for summarization..."
+ startTime <- getCurrentTime
result <-
- Provider.chat
- gemini
- []
- [ Provider.Message
- Provider.System
- "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
- Nothing
- Nothing,
- Provider.Message
- Provider.User
- ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
- Nothing
- Nothing
- ]
+ Timeout.timeout summarizeTimeoutMicros
+ <| Provider.chat
+ haiku
+ []
+ [ Provider.Message
+ Provider.System
+ ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
+ <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
+ )
+ Nothing
+ Nothing,
+ Provider.Message
+ Provider.User
+ ("Summarize: " <> url <> "\n\n" <> truncatedContent)
+ Nothing
+ Nothing
+ ]
+ endTime <- getCurrentTime
+ let elapsed = diffUTCTime endTime startTime
+ dbg ("[webreader] LLM call completed in " <> tshow elapsed)
case result of
- Left err -> pure (Left ("Summarization failed: " <> err))
- Right msg -> pure (Right (Provider.msgContent msg))
+ Nothing -> do
+ dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
+ pure (Left ("Timeout summarizing " <> url))
+ Just (Left err) -> do
+ dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
+ pure (Left ("Summarization failed: " <> err))
+ Just (Right msg) -> do
+ let summary = Text.take maxSummaryLength (Provider.msgContent msg)
+ dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
+ pure (Right summary)
+-- | Fetch and summarize a single URL, returning a result object
+-- This is the core function used by both single and batch tools
+fetchAndSummarize :: Text -> Text -> IO Aeson.Value
+fetchAndSummarize apiKey url = do
+ fetchResult <- fetchWebpage url
+ case fetchResult of
+ Left err ->
+ pure (Aeson.object ["url" .= url, "error" .= err])
+ Right html -> do
+ dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
+ extractStart <- getCurrentTime
+ textContent <- extractWithTrafilatura html
+ extractEnd <- getCurrentTime
+ let extractElapsed = diffUTCTime extractEnd extractStart
+ dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
+ if Text.null (Text.strip textContent)
+ then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+ else do
+ summaryResult <- summarizeContent apiKey url textContent
+ case summaryResult of
+ Left err ->
+ pure
+ ( Aeson.object
+ [ "url" .= url,
+ "error" .= err,
+ "raw_content" .= Text.take 2000 textContent
+ ]
+ )
+ Right summary ->
+ pure
+ ( Aeson.object
+ [ "url" .= url,
+ "success" .= True,
+ "summary" .= summary
+ ]
+ )
+
+-- | Web reader tool - fetches and summarizes webpages in parallel
webReaderTool :: Text -> Engine.Tool
webReaderTool apiKey =
Engine.Tool
- { Engine.toolName = "read_webpage",
+ { Engine.toolName = "read_webpages",
Engine.toolDescription =
- "Fetch and summarize a webpage. Use this when the user shares a URL or link "
- <> "and wants to know what it contains. Returns a summary of the page content.",
+ "Fetch and summarize webpages in parallel. Each page is processed independently - "
+ <> "failures on one page won't affect others. Returns a list of summaries.",
Engine.toolJsonSchema =
Aeson.object
[ "type" .= ("object" :: Text),
"properties"
.= Aeson.object
- [ "url"
+ [ "urls"
.= Aeson.object
- [ "type" .= ("string" :: Text),
- "description" .= ("The URL of the webpage to read" :: Text)
+ [ "type" .= ("array" :: Text),
+ "items" .= Aeson.object ["type" .= ("string" :: Text)],
+ "description" .= ("List of URLs to read and summarize" :: Text)
]
],
- "required" .= (["url"] :: [Text])
+ "required" .= (["urls"] :: [Text])
],
Engine.toolExecute = executeWebReader apiKey
}
@@ -172,39 +281,28 @@ executeWebReader apiKey v =
case Aeson.fromJSON v of
Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
Aeson.Success (args :: WebReaderArgs) -> do
- fetchResult <- fetchWebpage (wrUrl args)
- case fetchResult of
- Left err ->
- pure (Aeson.object ["error" .= err])
- Right html -> do
- let textContent = extractText html
- if Text.null (Text.strip textContent)
- then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
- else do
- summaryResult <- summarizeContent apiKey (wrUrl args) textContent
- case summaryResult of
- Left err ->
- pure
- ( Aeson.object
- [ "error" .= err,
- "raw_content" .= Text.take 2000 textContent
- ]
- )
- Right summary ->
- pure
- ( Aeson.object
- [ "success" .= True,
- "url" .= wrUrl args,
- "summary" .= summary
- ]
- )
+ let urls = wrUrls args
+ dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
+ results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
+ let succeeded = length (filter isSuccess results)
+ dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
+ pure
+ ( Aeson.object
+ [ "results" .= results,
+ "total" .= length urls,
+ "succeeded" .= succeeded
+ ]
+ )
+ where
+ isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
+ isSuccess _ = False
newtype WebReaderArgs = WebReaderArgs
- { wrUrl :: Text
+ { wrUrls :: [Text]
}
deriving (Generic)
instance Aeson.FromJSON WebReaderArgs where
parseJSON =
Aeson.withObject "WebReaderArgs" <| \v ->
- WebReaderArgs </ (v Aeson..: "url")
+ WebReaderArgs </ (v Aeson..: "urls")
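
For context, a minimal usage sketch of the new read_webpages tool (not part of this commit). It assumes a standard Prelude rather than the project's Alpha prelude, an OpenRouter key in the hypothetical OPENROUTER_API_KEY environment variable, and that Engine.toolExecute is the record field shown in the diff above (Aeson.Value -> IO Aeson.Value):

{-# LANGUAGE OverloadedStrings #-}
module Main where

import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.Text as Text
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Agent.Tools.WebReader as WebReader
import System.Environment (getEnv)

main :: IO ()
main = do
  apiKey <- Text.pack <$> getEnv "OPENROUTER_API_KEY"
  let tool = WebReader.webReaderTool apiKey
      args =
        Aeson.object
          ["urls" .= (["https://example.com", "https://www.haskell.org"] :: [Text.Text])]
  -- URLs are fetched, extracted, and summarized concurrently (up to
  -- maxConcurrentFetches at a time); each result carries either a
  -- "summary" or an "error" field, so one bad URL can't sink the batch.
  result <- Engine.toolExecute tool args
  BL.putStrLn (Aeson.encode result)

Control.Concurrent.Sema is project-local and not shown in this diff; a rough sketch of what Sema.mapPool likely does, built on the async and base libraries (an illustration, not the actual implementation):

import Control.Concurrent.Async (mapConcurrently)
import Control.Concurrent.QSem (newQSem, signalQSem, waitQSem)
import Control.Exception (bracket_)

-- Run f over xs concurrently, but never more than `limit` at a time.
mapPool :: Int -> (a -> IO b) -> [a] -> IO [b]
mapPool limit f xs = do
  sem <- newQSem limit
  mapConcurrently (bracket_ (waitQSem sem) (signalQSem sem) . f) xs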