| author | Ben Sima <ben@bensima.com> | 2025-12-16 13:24:54 -0500 |
|---|---|---|
| committer | Ben Sima <ben@bensima.com> | 2025-12-16 13:24:54 -0500 |
| commit | b18bd4eee969681ee532c4898ddaaa0851e6b846 (patch) | |
| tree | 0a966754459c5873b9dad4289ea51e901bd4399b /Omni/Agent/Tools/WebReader.hs | |
| parent | 122d73ac9d2472f91ed00965d03d1e761da72699 (diff) | |
Batch web_reader tool, much faster
Added retry with backoff, parallel processing, trimming pages down to their
main content, and summarization with haiku. It's so much faster and more
reliable now. Plus improved the logging system and disentangled the status UI
bar from the logging module.
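
The retry-with-backoff mentioned above is not visible in this file's diff (it presumably lives outside WebReader.hs), so here is a minimal sketch of that pattern in plain Haskell, using only base libraries. The name `retryWithBackoff`, the attempt limit, and the doubling delay schedule are illustrative assumptions, not the repository's actual implementation.

```haskell
import Control.Concurrent (threadDelay)
import Control.Exception (SomeException, try)

-- Retry an IO action up to maxAttempts times, doubling the delay between
-- attempts (base, 2x base, 4x base, ...). Names and limits are hypothetical.
retryWithBackoff :: Int -> Int -> IO a -> IO (Either SomeException a)
retryWithBackoff maxAttempts baseDelayMicros action = go 1
  where
    go attempt = do
      result <- try action
      case result of
        Right a -> pure (Right a)
        Left e
          | attempt >= maxAttempts -> pure (Left e)
          | otherwise -> do
              threadDelay (baseDelayMicros * (2 ^ (attempt - 1)))
              go (attempt + 1)
```

A caller might wrap the raw HTTP request, e.g. `retryWithBackoff 3 500000 (HTTP.httpLBS req)`, backing off 0.5s, then 1s, before giving up on a transient failure.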
Diffstat (limited to 'Omni/Agent/Tools/WebReader.hs')
| -rw-r--r-- | Omni/Agent/Tools/WebReader.hs | 318 |
1 file changed, 208 insertions(+), 110 deletions(-)
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
index 9b776ad..a69e3cf 100644
--- a/Omni/Agent/Tools/WebReader.hs
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -8,6 +8,7 @@
 -- : out omni-agent-tools-webreader
 -- : dep aeson
 -- : dep http-conduit
+-- : run trafilatura
 module Omni.Agent.Tools.WebReader
   ( -- * Tool
     webReaderTool,
@@ -15,6 +16,7 @@ module Omni.Agent.Tools.WebReader
     -- * Direct API
     fetchWebpage,
     extractText,
+    fetchAndSummarize,
 
     -- * Testing
     main,
@@ -23,16 +25,24 @@ module Omni.Agent.Tools.WebReader
 where
 
 import Alpha
+import qualified Control.Concurrent.Sema as Sema
 import Data.Aeson ((.=))
 import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as Text
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
 import qualified Network.HTTP.Client as HTTPClient
 import qualified Network.HTTP.Simple as HTTP
 import qualified Omni.Agent.Engine as Engine
 import qualified Omni.Agent.Provider as Provider
 import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.IO as IO
+import qualified System.Process as Process
+import qualified System.Timeout as Timeout
 
 main :: IO ()
 main = Test.run test
@@ -52,117 +62,216 @@ test =
         ("Content" `Text.isInfixOf` result) Test.@=? True,
       Test.unit "webReaderTool has correct schema" <| do
         let tool = webReaderTool "test-key"
-        Engine.toolName tool Test.@=? "read_webpage"
+        Engine.toolName tool Test.@=? "read_webpages"
     ]
 
+-- | Fetch timeout in microseconds (15 seconds - short because blocked sites won't respond anyway)
+fetchTimeoutMicros :: Int
+fetchTimeoutMicros = 15 * 1000000
+
+-- | Summarization timeout in microseconds (30 seconds)
+summarizeTimeoutMicros :: Int
+summarizeTimeoutMicros = 30 * 1000000
+
+-- | Maximum concurrent fetches
+maxConcurrentFetches :: Int
+maxConcurrentFetches = 10
+
+-- | Simple debug logging to stderr
+dbg :: Text -> IO ()
+dbg = TIO.hPutStrLn IO.stderr
+
 fetchWebpage :: Text -> IO (Either Text Text)
 fetchWebpage url = do
+  dbg ("[webreader] Fetching: " <> url)
   result <-
-    try <| do
-      req0 <- HTTP.parseRequest (Text.unpack url)
-      let req =
-            HTTP.setRequestMethod "GET"
-              <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
-              <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
-              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
-              <| req0
-      HTTP.httpLBS req
+    Timeout.timeout fetchTimeoutMicros <| do
+      innerResult <-
+        try <| do
+          req0 <- HTTP.parseRequest (Text.unpack url)
+          let req =
+                HTTP.setRequestMethod "GET"
+                  <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+                  <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+                  <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros)
+                  <| req0
+          HTTP.httpLBS req
+      case innerResult of
+        Left (e :: SomeException) -> do
+          dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e)
+          pure (Left ("Failed to fetch URL: " <> tshow e))
+        Right response -> do
+          let status = HTTP.getResponseStatusCode response
+          if status >= 200 && status < 300
+            then do
+              let body = HTTP.getResponseBody response
+                  text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+                  len = Text.length text
+              dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)")
+              pure (Right text)
+            else do
+              dbg ("[webreader] HTTP " <> tshow status <> ": " <> url)
+              pure (Left ("HTTP error: " <> tshow status))
   case result of
-    Left (e :: SomeException) ->
-      pure (Left ("Failed to fetch URL: " <> tshow e))
-    Right response -> do
-      let status = HTTP.getResponseStatusCode response
-      if status >= 200 && status < 300
-        then do
-          let body = HTTP.getResponseBody response
-              text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
-          pure (Right text)
-        else pure (Left ("HTTP error: " <> tshow status))
+    Nothing -> do
+      dbg ("[webreader] Timeout: " <> url)
+      pure (Left ("Timeout fetching " <> url))
+    Just r -> pure r
 
+-- | Fast single-pass text extraction from HTML
+-- Strips all tags in one pass, no expensive operations
 extractText :: Text -> Text
-extractText html =
-  let noScript = removeTagContent "script" html
-      noStyle = removeTagContent "style" noScript
-      noNoscript = removeTagContent "noscript" noStyle
-      noTags = stripTags noNoscript
-   in collapseWhitespace noTags
+extractText html = collapseWhitespace (stripAllTags html)
   where
-    removeTagContent :: Text -> Text -> Text
-    removeTagContent tag txt =
-      let openTag = "<" <> tag
-          closeTag = "</" <> tag <> ">"
-       in removeMatches openTag closeTag txt
-
-    removeMatches :: Text -> Text -> Text -> Text
-    removeMatches open close txt =
-      case Text.breakOn open (Text.toLower txt) of
-        (_, "") -> txt
-        (before, _) ->
-          let actualBefore = Text.take (Text.length before) txt
-              rest = Text.drop (Text.length before) txt
-           in case Text.breakOn close (Text.toLower rest) of
-                (_, "") -> actualBefore
-                (_, afterClose) ->
-                  let skipLen = Text.length close
-                      remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
-                   in actualBefore <> removeMatches open close remaining
-
-    stripTags :: Text -> Text
-    stripTags txt = go txt ""
+    -- Single pass: accumulate text outside of tags
+    stripAllTags :: Text -> Text
+    stripAllTags txt = Text.pack (go (Text.unpack txt) False [])
       where
-        go :: Text -> Text -> Text
-        go remaining acc =
-          case Text.breakOn "<" remaining of
-            (before, "") -> acc <> before
-            (before, rest) ->
-              case Text.breakOn ">" rest of
-                (_, "") -> acc <> before
-                (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
-
+        go :: [Char] -> Bool -> [Char] -> [Char]
+        go [] _ acc = reverse acc
+        go ('<' : rest) _ acc = go rest True acc -- Enter tag
+        go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space
+        go (_ : rest) True acc = go rest True acc -- Inside tag, skip
+        go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep
     collapseWhitespace = Text.unwords <. Text.words
 
+-- | Maximum chars to send for summarization (keep it small for fast LLM response)
+maxContentForSummary :: Int
+maxContentForSummary = 15000
+
+-- | Maximum summary length to return
+maxSummaryLength :: Int
+maxSummaryLength = 1000
+
+-- | Timeout for trafilatura extraction in microseconds (10 seconds)
+extractTimeoutMicros :: Int
+extractTimeoutMicros = 10 * 1000000
+
+-- | Extract article content using trafilatura (Python library)
+-- Falls back to naive extractText if trafilatura fails
+extractWithTrafilatura :: Text -> IO Text
+extractWithTrafilatura html = do
+  let pythonScript =
+        "import sys; import trafilatura; "
+          <> "html = sys.stdin.read(); "
+          <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
+          <> "print(result if result else '')"
+      proc =
+        (Process.proc "python3" ["-c", Text.unpack pythonScript])
+          { Process.std_in = Process.CreatePipe,
+            Process.std_out = Process.CreatePipe,
+            Process.std_err = Process.CreatePipe
+          }
+  result <-
+    Timeout.timeout extractTimeoutMicros <| do
+      (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
+      case exitCode of
+        Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
+        Exit.ExitFailure _ -> pure ""
+  case result of
+    Just txt | not (Text.null txt) -> pure txt
+    _ -> do
+      dbg "[webreader] trafilatura failed, falling back to naive extraction"
+      pure (extractText (Text.take 100000 html))
+
 summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
 summarizeContent apiKey url content = do
-  let truncatedContent = Text.take 50000 content
-      gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+  let truncatedContent = Text.take maxContentForSummary content
+      haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
+  dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
+  dbg "[webreader] Calling LLM for summarization..."
+  startTime <- getCurrentTime
   result <-
-    Provider.chat
-      gemini
-      []
-      [ Provider.Message
-          Provider.System
-          "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
-          Nothing
-          Nothing,
-        Provider.Message
-          Provider.User
-          ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
-          Nothing
-          Nothing
-      ]
+    Timeout.timeout summarizeTimeoutMicros
+      <| Provider.chat
+        haiku
+        []
+        [ Provider.Message
+            Provider.System
+            ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
+                <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
+            )
+            Nothing
+            Nothing,
+          Provider.Message
+            Provider.User
+            ("Summarize: " <> url <> "\n\n" <> truncatedContent)
+            Nothing
+            Nothing
+        ]
+  endTime <- getCurrentTime
+  let elapsed = diffUTCTime endTime startTime
+  dbg ("[webreader] LLM call completed in " <> tshow elapsed)
   case result of
-    Left err -> pure (Left ("Summarization failed: " <> err))
-    Right msg -> pure (Right (Provider.msgContent msg))
+    Nothing -> do
+      dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
+      pure (Left ("Timeout summarizing " <> url))
+    Just (Left err) -> do
+      dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
+      pure (Left ("Summarization failed: " <> err))
+    Just (Right msg) -> do
+      let summary = Text.take maxSummaryLength (Provider.msgContent msg)
+      dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
+      pure (Right summary)
 
+-- | Fetch and summarize a single URL, returning a result object
+-- This is the core function used by both single and batch tools
+fetchAndSummarize :: Text -> Text -> IO Aeson.Value
+fetchAndSummarize apiKey url = do
+  fetchResult <- fetchWebpage url
+  case fetchResult of
+    Left err ->
+      pure (Aeson.object ["url" .= url, "error" .= err])
+    Right html -> do
+      dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
+      extractStart <- getCurrentTime
+      textContent <- extractWithTrafilatura html
+      extractEnd <- getCurrentTime
+      let extractElapsed = diffUTCTime extractEnd extractStart
+      dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
+      if Text.null (Text.strip textContent)
+        then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+        else do
+          summaryResult <- summarizeContent apiKey url textContent
+          case summaryResult of
+            Left err ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "error" .= err,
+                      "raw_content" .= Text.take 2000 textContent
                    ]
+                )
+            Right summary ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "success" .= True,
+                      "summary" .= summary
+                    ]
+                )
+
+-- | Web reader tool - fetches and summarizes webpages in parallel
 webReaderTool :: Text -> Engine.Tool
 webReaderTool apiKey =
   Engine.Tool
-    { Engine.toolName = "read_webpage",
+    { Engine.toolName = "read_webpages",
       Engine.toolDescription =
-        "Fetch and summarize a webpage. Use this when the user shares a URL or link "
-          <> "and wants to know what it contains. Returns a summary of the page content.",
+        "Fetch and summarize webpages in parallel. Each page is processed independently - "
+          <> "failures on one page won't affect others. Returns a list of summaries.",
       Engine.toolJsonSchema =
         Aeson.object
           [ "type" .= ("object" :: Text),
            "properties"
              .= Aeson.object
-                [ "url"
+                [ "urls"
                    .= Aeson.object
-                      [ "type" .= ("string" :: Text),
-                        "description" .= ("The URL of the webpage to read" :: Text)
+                      [ "type" .= ("array" :: Text),
+                        "items" .= Aeson.object ["type" .= ("string" :: Text)],
+                        "description" .= ("List of URLs to read and summarize" :: Text)
                       ]
                ],
-            "required" .= (["url"] :: [Text])
+            "required" .= (["urls"] :: [Text])
          ],
       Engine.toolExecute = executeWebReader apiKey
     }
@@ -172,39 +281,28 @@ executeWebReader apiKey v =
   case Aeson.fromJSON v of
     Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
     Aeson.Success (args :: WebReaderArgs) -> do
-      fetchResult <- fetchWebpage (wrUrl args)
-      case fetchResult of
-        Left err ->
-          pure (Aeson.object ["error" .= err])
-        Right html -> do
-          let textContent = extractText html
-          if Text.null (Text.strip textContent)
-            then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
-            else do
-              summaryResult <- summarizeContent apiKey (wrUrl args) textContent
-              case summaryResult of
-                Left err ->
-                  pure
-                    ( Aeson.object
-                        [ "error" .= err,
-                          "raw_content" .= Text.take 2000 textContent
-                        ]
-                    )
-                Right summary ->
-                  pure
-                    ( Aeson.object
-                        [ "success" .= True,
-                          "url" .= wrUrl args,
-                          "summary" .= summary
-                        ]
-                    )
+      let urls = wrUrls args
+      dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
+      results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
+      let succeeded = length (filter isSuccess results)
+      dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
+      pure
+        ( Aeson.object
+            [ "results" .= results,
+              "total" .= length urls,
+              "succeeded" .= succeeded
+            ]
+        )
+  where
+    isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
+    isSuccess _ = False
 
 newtype WebReaderArgs = WebReaderArgs
-  { wrUrl :: Text
+  { wrUrls :: [Text]
   }
   deriving (Generic)
 
 instance Aeson.FromJSON WebReaderArgs where
   parseJSON = Aeson.withObject "WebReaderArgs" <| \v ->
-    WebReaderArgs </ (v Aeson..: "url")
+    WebReaderArgs </ (v Aeson..: "urls")
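
Below is a hedged sketch, separate from the commit itself, of how the renamed `read_webpages` tool might be invoked through its executor, assuming this module's imports (Aeson, Engine, Text, `(.=)`) are in scope; the API key and URL are placeholders.

```haskell
-- Hypothetical usage sketch (not part of the diff): call the batch tool's
-- executor with the new "urls" array argument. Per executeWebReader above,
-- the returned object carries "results", "total", and "succeeded" fields.
exampleCall :: IO Aeson.Value
exampleCall =
  Engine.toolExecute
    (webReaderTool "openrouter-api-key") -- placeholder key
    (Aeson.object ["urls" .= (["https://example.com"] :: [Text])])
```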
