Diffstat (limited to 'Omni/Agent/Tools/WebReader.hs')
-rw-r--r--   Omni/Agent/Tools/WebReader.hs   308
1 file changed, 308 insertions, 0 deletions
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
new file mode 100644
index 0000000..a69e3cf
--- /dev/null
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -0,0 +1,308 @@
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE NoImplicitPrelude #-}

-- | Web page reader tool - fetches and summarizes web pages.
--
-- : out omni-agent-tools-webreader
-- : dep aeson
-- : dep http-conduit
-- : run trafilatura
module Omni.Agent.Tools.WebReader
  ( -- * Tool
    webReaderTool,

    -- * Direct API
    fetchWebpage,
    extractText,
    fetchAndSummarize,

    -- * Testing
    main,
    test,
  )
where

import Alpha
import qualified Control.Concurrent.Sema as Sema
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.Aeson.KeyMap as KeyMap
import qualified Data.ByteString.Lazy as BL
import qualified Data.Char as Char
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import qualified Data.Text.IO as TIO
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import qualified Network.HTTP.Client as HTTPClient
import qualified Network.HTTP.Simple as HTTP
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Agent.Provider as Provider
import qualified Omni.Test as Test
import qualified System.Exit as Exit
import qualified System.IO as IO
import qualified System.Process as Process
import qualified System.Timeout as Timeout

main :: IO ()
main = Test.run test

test :: Test.Tree
test =
  Test.group
    "Omni.Agent.Tools.WebReader"
    [ Test.unit "extractText removes HTML tags" <| do
        let html = "<html><body><p>Hello world</p></body></html>"
            result = extractText html
        ("Hello world" `Text.isInfixOf` result) Test.@=? True,
      Test.unit "extractText removes script tags" <| do
        let html = "<html><script>alert('hi')</script><p>Content</p></html>"
            result = extractText html
        ("alert" `Text.isInfixOf` result) Test.@=? False
        ("Content" `Text.isInfixOf` result) Test.@=? True,
      Test.unit "webReaderTool has correct schema" <| do
        let tool = webReaderTool "test-key"
        Engine.toolName tool Test.@=? "read_webpages"
    ]
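-- A further unit one might add (a sketch, not in this diff, mirroring the
-- cases above), covering the style-element handling in 'extractText' below:
--
--   Test.unit "extractText removes style contents" <| do
--     let html = "<style>body { color: red }</style><p>Visible</p>"
--         result = extractText html
--     ("color" `Text.isInfixOf` result) Test.@=? False
--     ("Visible" `Text.isInfixOf` result) Test.@=? True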
"read_webpages" + ] + +-- | Fetch timeout in microseconds (15 seconds - short because blocked sites won't respond anyway) +fetchTimeoutMicros :: Int +fetchTimeoutMicros = 15 * 1000000 + +-- | Summarization timeout in microseconds (30 seconds) +summarizeTimeoutMicros :: Int +summarizeTimeoutMicros = 30 * 1000000 + +-- | Maximum concurrent fetches +maxConcurrentFetches :: Int +maxConcurrentFetches = 10 + +-- | Simple debug logging to stderr +dbg :: Text -> IO () +dbg = TIO.hPutStrLn IO.stderr + +fetchWebpage :: Text -> IO (Either Text Text) +fetchWebpage url = do + dbg ("[webreader] Fetching: " <> url) + result <- + Timeout.timeout fetchTimeoutMicros <| do + innerResult <- + try <| do + req0 <- HTTP.parseRequest (Text.unpack url) + let req = + HTTP.setRequestMethod "GET" + <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"] + <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"] + <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros) + <| req0 + HTTP.httpLBS req + case innerResult of + Left (e :: SomeException) -> do + dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e) + pure (Left ("Failed to fetch URL: " <> tshow e)) + Right response -> do + let status = HTTP.getResponseStatusCode response + if status >= 200 && status < 300 + then do + let body = HTTP.getResponseBody response + text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body) + len = Text.length text + dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)") + pure (Right text) + else do + dbg ("[webreader] HTTP " <> tshow status <> ": " <> url) + pure (Left ("HTTP error: " <> tshow status)) + case result of + Nothing -> do + dbg ("[webreader] Timeout: " <> url) + pure (Left ("Timeout fetching " <> url)) + Just r -> pure r + +-- | Fast single-pass text extraction from HTML +-- Strips all tags in one pass, no expensive operations +extractText :: Text -> Text +extractText html = collapseWhitespace (stripAllTags html) + where + -- Single pass: accumulate text outside of tags + stripAllTags :: Text -> Text + stripAllTags txt = Text.pack (go (Text.unpack txt) False []) + where + go :: [Char] -> Bool -> [Char] -> [Char] + go [] _ acc = reverse acc + go ('<' : rest) _ acc = go rest True acc -- Enter tag + go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space + go (_ : rest) True acc = go rest True acc -- Inside tag, skip + go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep + collapseWhitespace = Text.unwords <. 
-- | Maximum chars to send for summarization (keep it small for fast LLM response)
maxContentForSummary :: Int
maxContentForSummary = 15000

-- | Maximum summary length to return
maxSummaryLength :: Int
maxSummaryLength = 1000

-- | Timeout for trafilatura extraction in microseconds (10 seconds)
extractTimeoutMicros :: Int
extractTimeoutMicros = 10 * 1000000

-- | Extract article content using trafilatura (Python library).
-- Falls back to naive extractText if trafilatura fails.
extractWithTrafilatura :: Text -> IO Text
extractWithTrafilatura html = do
  let pythonScript =
        "import sys; import trafilatura; "
          <> "html = sys.stdin.read(); "
          <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
          <> "print(result if result else '')"
      proc =
        (Process.proc "python3" ["-c", Text.unpack pythonScript])
          { Process.std_in = Process.CreatePipe,
            Process.std_out = Process.CreatePipe,
            Process.std_err = Process.CreatePipe
          }
  result <-
    Timeout.timeout extractTimeoutMicros <| do
      (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
      case exitCode of
        Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
        Exit.ExitFailure _ -> pure ""
  case result of
    Just txt | not (Text.null txt) -> pure txt
    _ -> do
      dbg "[webreader] trafilatura failed, falling back to naive extraction"
      pure (extractText (Text.take 100000 html))
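-- A hedged helper sketch (not present in this diff): probe whether the
-- trafilatura dependency declared in the module header (": run trafilatura")
-- is importable, reusing the same System.Process machinery as above.
hasTrafilatura :: IO Bool
hasTrafilatura = do
  let check = Process.proc "python3" ["-c", "import trafilatura"]
  result <- try (Process.readCreateProcessWithExitCode check "")
  pure <| case result of
    Right (Exit.ExitSuccess, _, _) -> True
    Right (Exit.ExitFailure _, _, _) -> False -- python3 present, module missing
    Left (_ :: SomeException) -> False -- python3 not found at all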
summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
summarizeContent apiKey url content = do
  let truncatedContent = Text.take maxContentForSummary content
      haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
  dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
  dbg "[webreader] Calling LLM for summarization..."
  startTime <- getCurrentTime
  result <-
    Timeout.timeout summarizeTimeoutMicros
      <| Provider.chat
        haiku
        []
        [ Provider.Message
            Provider.System
            ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
                <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
            )
            Nothing
            Nothing,
          Provider.Message
            Provider.User
            ("Summarize: " <> url <> "\n\n" <> truncatedContent)
            Nothing
            Nothing
        ]
  endTime <- getCurrentTime
  let elapsed = diffUTCTime endTime startTime
  dbg ("[webreader] LLM call completed in " <> tshow elapsed)
  case result of
    Nothing -> do
      dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
      pure (Left ("Timeout summarizing " <> url))
    Just (Left err) -> do
      dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
      pure (Left ("Summarization failed: " <> err))
    Just (Right msg) -> do
      let summary = Text.take maxSummaryLength (Provider.msgContent msg)
      dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
      pure (Right summary)

-- | Fetch and summarize a single URL, returning a result object.
-- This is the core function used by both single and batch tools.
fetchAndSummarize :: Text -> Text -> IO Aeson.Value
fetchAndSummarize apiKey url = do
  fetchResult <- fetchWebpage url
  case fetchResult of
    Left err ->
      pure (Aeson.object ["url" .= url, "error" .= err])
    Right html -> do
      dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
      extractStart <- getCurrentTime
      textContent <- extractWithTrafilatura html
      extractEnd <- getCurrentTime
      let extractElapsed = diffUTCTime extractEnd extractStart
      dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
      if Text.null (Text.strip textContent)
        then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
        else do
          summaryResult <- summarizeContent apiKey url textContent
          case summaryResult of
            Left err ->
              pure
                ( Aeson.object
                    [ "url" .= url,
                      "error" .= err,
                      "raw_content" .= Text.take 2000 textContent
                    ]
                )
            Right summary ->
              pure
                ( Aeson.object
                    [ "url" .= url,
                      "success" .= True,
                      "summary" .= summary
                    ]
                )
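-- For reference, the per-URL result shapes produced above (field names taken
-- from the code; the values are illustrative):
--
--   {"url": ..., "success": true, "summary": "- ..."}
--   {"url": ..., "error": "HTTP error: 404"}
--   {"url": ..., "error": "Summarization failed: ...", "raw_content": "..."}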
-- | Web reader tool - fetches and summarizes webpages in parallel
webReaderTool :: Text -> Engine.Tool
webReaderTool apiKey =
  Engine.Tool
    { Engine.toolName = "read_webpages",
      Engine.toolDescription =
        "Fetch and summarize webpages in parallel. Each page is processed independently - "
          <> "failures on one page won't affect others. Returns a list of summaries.",
      Engine.toolJsonSchema =
        Aeson.object
          [ "type" .= ("object" :: Text),
            "properties"
              .= Aeson.object
                [ "urls"
                    .= Aeson.object
                      [ "type" .= ("array" :: Text),
                        "items" .= Aeson.object ["type" .= ("string" :: Text)],
                        "description" .= ("List of URLs to read and summarize" :: Text)
                      ]
                ],
            "required" .= (["urls"] :: [Text])
          ],
      Engine.toolExecute = executeWebReader apiKey
    }

executeWebReader :: Text -> Aeson.Value -> IO Aeson.Value
executeWebReader apiKey v =
  case Aeson.fromJSON v of
    Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
    Aeson.Success (args :: WebReaderArgs) -> do
      let urls = wrUrls args
      dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
      results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
      let succeeded = length (filter isSuccess results)
      dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
      pure
        ( Aeson.object
            [ "results" .= results,
              "total" .= length urls,
              "succeeded" .= succeeded
            ]
        )
  where
    isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
    isSuccess _ = False

newtype WebReaderArgs = WebReaderArgs
  { wrUrls :: [Text]
  }
  deriving (Generic)

instance Aeson.FromJSON WebReaderArgs where
  parseJSON =
    Aeson.withObject "WebReaderArgs" <| \v ->
      WebReaderArgs </ (v Aeson..: "urls")
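-- A minimal end-to-end sketch (placeholder key and URL, not part of this
-- change), driving the batch path through the tool's execute field:
demo :: IO ()
demo = do
  let tool = webReaderTool "sk-or-placeholder" -- hypothetical OpenRouter key
  result <-
    Engine.toolExecute
      tool
      (Aeson.object ["urls" .= (["https://example.com"] :: [Text])])
  BL.putStr (Aeson.encode result)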
