Diffstat (limited to 'Omni/Agent/Tools/WebReader.hs')
-rw-r--r--   Omni/Agent/Tools/WebReader.hs   318
1 file changed, 208 insertions, 110 deletions
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
index 9b776ad..a69e3cf 100644
--- a/Omni/Agent/Tools/WebReader.hs
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -8,6 +8,7 @@
 -- : out omni-agent-tools-webreader
 -- : dep aeson
 -- : dep http-conduit
+-- : run trafilatura
 module Omni.Agent.Tools.WebReader
   ( -- * Tool
     webReaderTool,
@@ -15,6 +16,7 @@ module Omni.Agent.Tools.WebReader
     -- * Direct API
     fetchWebpage,
    extractText,
+    fetchAndSummarize,

     -- * Testing
     main,
@@ -23,16 +25,24 @@
 where

 import Alpha
+import qualified Control.Concurrent.Sema as Sema
 import Data.Aeson ((.=))
 import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as Text
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
 import qualified Network.HTTP.Client as HTTPClient
 import qualified Network.HTTP.Simple as HTTP
 import qualified Omni.Agent.Engine as Engine
 import qualified Omni.Agent.Provider as Provider
 import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.IO as IO
+import qualified System.Process as Process
+import qualified System.Timeout as Timeout

 main :: IO ()
 main = Test.run test
@@ -52,117 +62,216 @@
         ("Content" `Text.isInfixOf` result) Test.@=? True,
       Test.unit "webReaderTool has correct schema" <| do
         let tool = webReaderTool "test-key"
-        Engine.toolName tool Test.@=? "read_webpage"
+        Engine.toolName tool Test.@=? "read_webpages"
     ]

+-- | Fetch timeout in microseconds (15 seconds - short because blocked sites won't respond anyway)
+fetchTimeoutMicros :: Int
+fetchTimeoutMicros = 15 * 1000000
+
+-- | Summarization timeout in microseconds (30 seconds)
+summarizeTimeoutMicros :: Int
+summarizeTimeoutMicros = 30 * 1000000
+
+-- | Maximum concurrent fetches
+maxConcurrentFetches :: Int
+maxConcurrentFetches = 10
+
+-- | Simple debug logging to stderr
+dbg :: Text -> IO ()
+dbg = TIO.hPutStrLn IO.stderr
+
 fetchWebpage :: Text -> IO (Either Text Text)
 fetchWebpage url = do
+  dbg ("[webreader] Fetching: " <> url)
   result <-
-    try <| do
-      req0 <- HTTP.parseRequest (Text.unpack url)
-      let req =
-            HTTP.setRequestMethod "GET"
-              <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
-              <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
-              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
-              <| req0
-      HTTP.httpLBS req
+    Timeout.timeout fetchTimeoutMicros <| do
+      innerResult <-
+        try <| do
+          req0 <- HTTP.parseRequest (Text.unpack url)
+          let req =
+                HTTP.setRequestMethod "GET"
+                  <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+                  <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+                  <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros)
+                  <| req0
+          HTTP.httpLBS req
+      case innerResult of
+        Left (e :: SomeException) -> do
+          dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e)
+          pure (Left ("Failed to fetch URL: " <> tshow e))
+        Right response -> do
+          let status = HTTP.getResponseStatusCode response
+          if status >= 200 && status < 300
+            then do
+              let body = HTTP.getResponseBody response
+                  text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+                  len = Text.length text
+              dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)")
+              pure (Right text)
+            else do
+              dbg ("[webreader] HTTP " <> tshow status <> ": " <> url)
+              pure (Left ("HTTP error: " <> tshow status))
   case result of
-    Left (e :: SomeException) ->
-      pure (Left ("Failed to fetch URL: " <> tshow e))
-    Right response -> do
-      let status = HTTP.getResponseStatusCode response
-      if status >= 200 && status < 300
-        then do
-          let body = HTTP.getResponseBody response
-              text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
-          pure (Right text)
-        else pure (Left ("HTTP error: " <> tshow status))
+    Nothing -> do
+      dbg ("[webreader] Timeout: " <> url)
+      pure (Left ("Timeout fetching " <> url))
+    Just r -> pure r

+-- | Fast single-pass text extraction from HTML
+-- Strips all tags in one pass, no expensive operations
 extractText :: Text -> Text
-extractText html =
-  let noScript = removeTagContent "script" html
-      noStyle = removeTagContent "style" noScript
-      noNoscript = removeTagContent "noscript" noStyle
-      noTags = stripTags noNoscript
-   in collapseWhitespace noTags
+extractText html = collapseWhitespace (stripAllTags html)
   where
-    removeTagContent :: Text -> Text -> Text
-    removeTagContent tag txt =
-      let openTag = "<" <> tag
-          closeTag = "</" <> tag <> ">"
-       in removeMatches openTag closeTag txt
-
-    removeMatches :: Text -> Text -> Text -> Text
-    removeMatches open close txt =
-      case Text.breakOn open (Text.toLower txt) of
-        (_, "") -> txt
-        (before, _) ->
-          let actualBefore = Text.take (Text.length before) txt
-              rest = Text.drop (Text.length before) txt
-           in case Text.breakOn close (Text.toLower rest) of
-                (_, "") -> actualBefore
-                (_, afterClose) ->
-                  let skipLen = Text.length close
-                      remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
-                   in actualBefore <> removeMatches open close remaining
-
-    stripTags :: Text -> Text
-    stripTags txt = go txt ""
+    -- Single pass: accumulate text outside of tags
+    stripAllTags :: Text -> Text
+    stripAllTags txt = Text.pack (go (Text.unpack txt) False [])
       where
-        go :: Text -> Text -> Text
-        go remaining acc =
-          case Text.breakOn "<" remaining of
-            (before, "") -> acc <> before
-            (before, rest) ->
-              case Text.breakOn ">" rest of
-                (_, "") -> acc <> before
-                (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
-
+        go :: [Char] -> Bool -> [Char] -> [Char]
+        go [] _ acc = reverse acc
+        go ('<' : rest) _ acc = go rest True acc -- Enter tag
+        go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space
+        go (_ : rest) True acc = go rest True acc -- Inside tag, skip
+        go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep
     collapseWhitespace = Text.unwords <. Text.words

+-- | Maximum chars to send for summarization (keep it small for fast LLM response)
+maxContentForSummary :: Int
+maxContentForSummary = 15000
+
+-- | Maximum summary length to return
+maxSummaryLength :: Int
+maxSummaryLength = 1000
+
+-- | Timeout for trafilatura extraction in microseconds (10 seconds)
+extractTimeoutMicros :: Int
+extractTimeoutMicros = 10 * 1000000
+
+-- | Extract article content using trafilatura (Python library)
+-- Falls back to naive extractText if trafilatura fails
+extractWithTrafilatura :: Text -> IO Text
+extractWithTrafilatura html = do
+  let pythonScript =
+        "import sys; import trafilatura; "
+          <> "html = sys.stdin.read(); "
+          <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
+          <> "print(result if result else '')"
+      proc =
+        (Process.proc "python3" ["-c", Text.unpack pythonScript])
+          { Process.std_in = Process.CreatePipe,
+            Process.std_out = Process.CreatePipe,
+            Process.std_err = Process.CreatePipe
+          }
+  result <-
+    Timeout.timeout extractTimeoutMicros <| do
+      (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
+      case exitCode of
+        Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
+        Exit.ExitFailure _ -> pure ""
+  case result of
+    Just txt | not (Text.null txt) -> pure txt
+    _ -> do
+      dbg "[webreader] trafilatura failed, falling back to naive extraction"
+      pure (extractText (Text.take 100000 html))
+
 summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
 summarizeContent apiKey url content = do
-  let truncatedContent = Text.take 50000 content
-      gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+  let truncatedContent = Text.take maxContentForSummary content
+      haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
+  dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
+  dbg "[webreader] Calling LLM for summarization..."
+  startTime <- getCurrentTime
   result <-
-    Provider.chat
-      gemini
-      []
-      [ Provider.Message
-          Provider.System
-          "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
-          Nothing
-          Nothing,
-        Provider.Message
-          Provider.User
-          ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
-          Nothing
-          Nothing
-      ]
+    Timeout.timeout summarizeTimeoutMicros
+      <| Provider.chat
+        haiku
+        []
+        [ Provider.Message
+            Provider.System
+            ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
+                <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
+            )
+            Nothing
+            Nothing,
+          Provider.Message
+            Provider.User
+            ("Summarize: " <> url <> "\n\n" <> truncatedContent)
+            Nothing
+            Nothing
+        ]
+  endTime <- getCurrentTime
+  let elapsed = diffUTCTime endTime startTime
+  dbg ("[webreader] LLM call completed in " <> tshow elapsed)
   case result of
-    Left err -> pure (Left ("Summarization failed: " <> err))
-    Right msg -> pure (Right (Provider.msgContent msg))
+    Nothing -> do
+      dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
+      pure (Left ("Timeout summarizing " <> url))
+    Just (Left err) -> do
+      dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
+      pure (Left ("Summarization failed: " <> err))
+    Just (Right msg) -> do
+      let summary = Text.take maxSummaryLength (Provider.msgContent msg)
+      dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
+      pure (Right summary)

+-- | Fetch and summarize a single URL, returning a result object
+-- This is the core function used by both single and batch tools
+fetchAndSummarize :: Text -> Text -> IO Aeson.Value
+fetchAndSummarize apiKey url = do
+  fetchResult <- fetchWebpage url
+  case fetchResult of
+    Left err ->
+      pure (Aeson.object ["url" .= url, "error" .= err])
+    Right html -> do
+      dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
+      extractStart <- getCurrentTime
+      textContent <- extractWithTrafilatura html
+      extractEnd <- getCurrentTime
+      let extractElapsed = diffUTCTime extractEnd extractStart
+      dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
+      if Text.null (Text.strip textContent)
+        then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+        else do
+          summaryResult <- summarizeContent apiKey url textContent
+          case summaryResult of
+            Left err ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "error" .= err,
+                      "raw_content" .= Text.take 2000 textContent
+                    ]
+                )
+            Right summary ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "success" .= True,
+                      "summary" .= summary
+                    ]
+                )
+
+-- | Web reader tool - fetches and summarizes webpages in parallel
 webReaderTool :: Text -> Engine.Tool
 webReaderTool apiKey =
   Engine.Tool
-    { Engine.toolName = "read_webpage",
+    { Engine.toolName = "read_webpages",
       Engine.toolDescription =
-        "Fetch and summarize a webpage. Use this when the user shares a URL or link "
-          <> "and wants to know what it contains. Returns a summary of the page content.",
+        "Fetch and summarize webpages in parallel. Each page is processed independently - "
+          <> "failures on one page won't affect others. Returns a list of summaries.",
       Engine.toolJsonSchema =
         Aeson.object
          [ "type" .= ("object" :: Text),
            "properties"
              .= Aeson.object
-                [ "url"
+                [ "urls"
                    .= Aeson.object
-                      [ "type" .= ("string" :: Text),
-                        "description" .= ("The URL of the webpage to read" :: Text)
+                      [ "type" .= ("array" :: Text),
+                        "items" .= Aeson.object ["type" .= ("string" :: Text)],
+                        "description" .= ("List of URLs to read and summarize" :: Text)
                       ]
                 ],
-            "required" .= (["url"] :: [Text])
+            "required" .= (["urls"] :: [Text])
          ],
       Engine.toolExecute = executeWebReader apiKey
     }
@@ -172,39 +281,28 @@
 executeWebReader apiKey v =
   case Aeson.fromJSON v of
     Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
     Aeson.Success (args :: WebReaderArgs) -> do
-      fetchResult <- fetchWebpage (wrUrl args)
-      case fetchResult of
-        Left err ->
-          pure (Aeson.object ["error" .= err])
-        Right html -> do
-          let textContent = extractText html
-          if Text.null (Text.strip textContent)
-            then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
-            else do
-              summaryResult <- summarizeContent apiKey (wrUrl args) textContent
-              case summaryResult of
-                Left err ->
-                  pure
-                    ( Aeson.object
-                        [ "error" .= err,
-                          "raw_content" .= Text.take 2000 textContent
-                        ]
-                    )
-                Right summary ->
-                  pure
-                    ( Aeson.object
-                        [ "success" .= True,
-                          "url" .= wrUrl args,
-                          "summary" .= summary
-                        ]
-                    )
+      let urls = wrUrls args
+      dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
+      results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
+      let succeeded = length (filter isSuccess results)
+      dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
+      pure
+        ( Aeson.object
+            [ "results" .= results,
+              "total" .= length urls,
+              "succeeded" .= succeeded
+            ]
+        )
+  where
+    isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
+    isSuccess _ = False

 newtype WebReaderArgs = WebReaderArgs
-  { wrUrl :: Text
+  { wrUrls :: [Text]
   }
   deriving (Generic)

 instance Aeson.FromJSON WebReaderArgs where
   parseJSON = Aeson.withObject "WebReaderArgs" <| \v ->
-    WebReaderArgs </ (v Aeson..: "url")
+    WebReaderArgs </ (v Aeson..: "urls")
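Note on the batch path: `Sema.mapPool` comes from `Control.Concurrent.Sema`, an internal module whose implementation is not part of this diff. A minimal sketch of the semantics it is assumed to provide (run an action over a list with at most `limit` actions in flight at once), built from base's QSem and the async package:

import Control.Concurrent.Async (mapConcurrently)
import Control.Concurrent.QSem (newQSem, signalQSem, waitQSem)
import Control.Exception (bracket_)

-- Assumed behavior of Sema.mapPool: like mapConcurrently, but a semaphore
-- caps how many of the actions run at the same time.
mapPool :: Int -> (a -> IO b) -> [a] -> IO [b]
mapPool limit action xs = do
  sem <- newQSem limit
  mapConcurrently (\x -> bracket_ (waitQSem sem) (signalQSem sem) (action x)) xs

With maxConcurrentFetches = 10, a batch of 30 URLs runs as overlapping waves of at most ten fetches rather than 30 simultaneous ones, keeping socket and python3 subprocess usage bounded.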
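For reference, a hypothetical direct invocation of the reworked tool from code in the same module (the API key and URL are placeholders; the argument and result shapes follow the schema and executeWebReader above):

demo :: IO ()
demo = do
  let tool = webReaderTool "sk-or-..." -- placeholder OpenRouter API key
      args = Aeson.object ["urls" .= (["https://example.com"] :: [Text])]
  -- Each URL goes through fetch -> trafilatura extraction -> summarization,
  -- at most maxConcurrentFetches at a time; a per-URL failure becomes a
  -- {"url": ..., "error": ...} entry instead of failing the whole batch.
  result <- Engine.toolExecute tool args
  -- result is an object like:
  --   {"results": [{"url": "https://example.com", "success": true,
  --                 "summary": "..."}], "total": 1, "succeeded": 1}
  print result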
