From 4ff40843e7a6801b7785bfff7f4e9e8fff4e27d4 Mon Sep 17 00:00:00 2001
From: Ben Sima
Date: Sat, 13 Dec 2025 00:35:24 -0500
Subject: telegram: fix parsing, add webpage reader, use gemini

- Fix Provider.hs to strip leading whitespace from OpenRouter responses
- Fix FunctionCall parser to handle missing 'arguments' field
- Use eitherDecode for better error messages on parse failures
- Switch to claude-sonnet-4.5 for main agent
- Use gemini-2.0-flash for conversation summarization (cheaper)
- Add read_webpage tool for fetching and summarizing URLs
- Add tagsoup to Haskell deps (unused, kept for future)
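
The Provider.hs fixes are outside this filtered diff. As a sketch of the
parser fix (type and field names here are illustrative, not the actual
Provider.hs definitions; `</` is assumed to be Alpha's fmap operator, as
the FromJSON instance at the end of this diff suggests), the idea is to
default a missing 'arguments' field and decode with eitherDecode so a
failure carries a reason:

    data FunctionCall = FunctionCall
      { fcName :: Text,
        fcArguments :: Text
      }

    instance Aeson.FromJSON FunctionCall where
      parseJSON =
        Aeson.withObject "FunctionCall" <| \v ->
          FunctionCall
            </ v Aeson..: "name"
            -- (.:?) tolerates an absent field; (.!=) supplies the default
            <*> (v Aeson..:? "arguments" Aeson..!= "{}")

    -- Aeson.eitherDecode body :: Either String FunctionCall
    -- yields 'Left reason' on malformed JSON instead of a bare Nothing.

The read_webpage tool added below is self-contained - a name, a JSON
schema, and an executor - so it can also be exercised directly, outside
the agent loop (the API key and URL are placeholders):

    demo :: IO ()
    demo = do
      let tool = webReaderTool "sk-or-placeholder"
      result <-
        Engine.toolExecute
          tool
          (Aeson.object ["url" .= ("https://example.com" :: Text)])
      print result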
---
 Omni/Agent/Tools/WebReader.hs | 210 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 Omni/Agent/Tools/WebReader.hs

(limited to 'Omni/Agent/Tools/WebReader.hs')

diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
new file mode 100644
index 0000000..9b776ad
--- /dev/null
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -0,0 +1,210 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Web page reader tool - fetches and summarizes web pages.
+--
+-- : out omni-agent-tools-webreader
+-- : dep aeson
+-- : dep http-conduit
+module Omni.Agent.Tools.WebReader
+  ( -- * Tool
+    webReaderTool,
+
+    -- * Direct API
+    fetchWebpage,
+    extractText,
+
+    -- * Testing
+    main,
+    test,
+  )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as Text
+import qualified Data.Text.Encoding as TE
+import qualified Network.HTTP.Client as HTTPClient
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Agent.Provider as Provider
+import qualified Omni.Test as Test
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+  Test.group
+    "Omni.Agent.Tools.WebReader"
+    [ Test.unit "extractText removes HTML tags" <| do
+        let html = "<html><body><p>Hello world</p></body></html>"
+            result = extractText html
+        ("Hello world" `Text.isInfixOf` result) Test.@=? True,
+      Test.unit "extractText removes script tags" <| do
+        let html = "<html><body><script>alert('x')</script><p>Content</p></body></html>"
+            result = extractText html
+        ("alert" `Text.isInfixOf` result) Test.@=? False
+        ("Content" `Text.isInfixOf` result) Test.@=? True,
+      Test.unit "webReaderTool has correct schema" <| do
+        let tool = webReaderTool "test-key"
+        Engine.toolName tool Test.@=? "read_webpage"
+    ]
+
+fetchWebpage :: Text -> IO (Either Text Text)
+fetchWebpage url = do
+  result <-
+    try <| do
+      req0 <- HTTP.parseRequest (Text.unpack url)
+      let req =
+            HTTP.setRequestMethod "GET"
+              <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+              <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
+              <| req0
+      HTTP.httpLBS req
+  case result of
+    Left (e :: SomeException) ->
+      pure (Left ("Failed to fetch URL: " <> tshow e))
+    Right response -> do
+      let status = HTTP.getResponseStatusCode response
+      if status >= 200 && status < 300
+        then do
+          let body = HTTP.getResponseBody response
+              text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+          pure (Right text)
+        else pure (Left ("HTTP error: " <> tshow status))
+
+extractText :: Text -> Text
+extractText html =
+  let noScript = removeTagContent "script" html
+      noStyle = removeTagContent "style" noScript
+      noNoscript = removeTagContent "noscript" noStyle
+      noTags = stripTags noNoscript
+   in collapseWhitespace noTags
+  where
+    removeTagContent :: Text -> Text -> Text
+    removeTagContent tag txt =
+      let openTag = "<" <> tag
+          closeTag = "</" <> tag <> ">"
+       in removeMatches openTag closeTag txt
+
+    removeMatches :: Text -> Text -> Text -> Text
+    removeMatches open close txt =
+      case Text.breakOn open (Text.toLower txt) of
+        (_, "") -> txt
+        (before, _) ->
+          let actualBefore = Text.take (Text.length before) txt
+              rest = Text.drop (Text.length before) txt
+           in case Text.breakOn close (Text.toLower rest) of
+                (_, "") -> actualBefore
+                (_, afterClose) ->
+                  let skipLen = Text.length close
+                      remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
+                   in actualBefore <> removeMatches open close remaining
+
+    stripTags :: Text -> Text
+    stripTags txt = go txt ""
+      where
+        go :: Text -> Text -> Text
+        go remaining acc =
+          case Text.breakOn "<" remaining of
+            (before, "") -> acc <> before
+            (before, rest) ->
+              case Text.breakOn ">" rest of
+                (_, "") -> acc <> before
+                (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
+
+    collapseWhitespace = Text.unwords <. Text.words
+
+summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
+summarizeContent apiKey url content = do
+  let truncatedContent = Text.take 50000 content
+      gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+  result <-
+    Provider.chat
+      gemini
+      []
+      [ Provider.Message
+          Provider.System
+          "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
+          Nothing
+          Nothing,
+        Provider.Message
+          Provider.User
+          ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
+          Nothing
+          Nothing
+      ]
+  case result of
+    Left err -> pure (Left ("Summarization failed: " <> err))
+    Right msg -> pure (Right (Provider.msgContent msg))
+
+webReaderTool :: Text -> Engine.Tool
+webReaderTool apiKey =
+  Engine.Tool
+    { Engine.toolName = "read_webpage",
+      Engine.toolDescription =
+        "Fetch and summarize a webpage. Use this when the user shares a URL or link "
+          <> "and wants to know what it contains. Returns a summary of the page content.",
+      Engine.toolJsonSchema =
+        Aeson.object
+          [ "type" .= ("object" :: Text),
+            "properties"
+              .= Aeson.object
+                [ "url"
+                    .= Aeson.object
+                      [ "type" .= ("string" :: Text),
+                        "description" .= ("The URL of the webpage to read" :: Text)
+                      ]
+                ],
+            "required" .= (["url"] :: [Text])
+          ],
+      Engine.toolExecute = executeWebReader apiKey
+    }
+
+executeWebReader :: Text -> Aeson.Value -> IO Aeson.Value
+executeWebReader apiKey v =
+  case Aeson.fromJSON v of
+    Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
+    Aeson.Success (args :: WebReaderArgs) -> do
+      fetchResult <- fetchWebpage (wrUrl args)
+      case fetchResult of
+        Left err ->
+          pure (Aeson.object ["error" .= err])
+        Right html -> do
+          let textContent = extractText html
+          if Text.null (Text.strip textContent)
+            then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+            else do
+              summaryResult <- summarizeContent apiKey (wrUrl args) textContent
+              case summaryResult of
+                Left err ->
+                  pure
+                    ( Aeson.object
+                        [ "error" .= err,
+                          "raw_content" .= Text.take 2000 textContent
+                        ]
+                    )
+                Right summary ->
+                  pure
+                    ( Aeson.object
+                        [ "success" .= True,
+                          "url" .= wrUrl args,
+                          "summary" .= summary
+                        ]
+                    )
+
+newtype WebReaderArgs = WebReaderArgs
+  { wrUrl :: Text
+  }
+  deriving (Generic)
+
+instance Aeson.FromJSON WebReaderArgs where
+  parseJSON =
+    Aeson.withObject "WebReaderArgs" <| \v ->
+      WebReaderArgs </ v Aeson..: "url"

From: Ben Sima
Date: Tue, 16 Dec 2025 13:24:54 -0500
Subject: Batch web_reader tool, much faster

Added retry with backoff, parallel processing, editing pages down to main
content, and summarization with haiku. It's so much faster and more
reliable now. Plus improved the logging system and disentangled the
status UI bar from the logging module.
---
 Omni/Agent/Tools/WebReader.hs | 318 +++++++++++++++++++++++++++---------------
 1 file changed, 208 insertions(+), 110 deletions(-)

(limited to 'Omni/Agent/Tools/WebReader.hs')

diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
index 9b776ad..a69e3cf 100644
--- a/Omni/Agent/Tools/WebReader.hs
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -8,6 +8,7 @@
 -- : out omni-agent-tools-webreader
 -- : dep aeson
 -- : dep http-conduit
+-- : run trafilatura
 module Omni.Agent.Tools.WebReader
   ( -- * Tool
     webReaderTool,
@@ -15,6 +16,7 @@ module Omni.Agent.Tools.WebReader
     -- * Direct API
     fetchWebpage,
     extractText,
+    fetchAndSummarize,
 
     -- * Testing
     main,
@@ -23,16 +25,24 @@ module Omni.Agent.Tools.WebReader
 where
 
 import Alpha
+import qualified Control.Concurrent.Sema as Sema
 import Data.Aeson ((.=))
 import qualified Data.Aeson as Aeson
+import qualified Data.Aeson.KeyMap as KeyMap
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as Text
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
 import qualified Network.HTTP.Client as HTTPClient
 import qualified Network.HTTP.Simple as HTTP
 import qualified Omni.Agent.Engine as Engine
 import qualified Omni.Agent.Provider as Provider
 import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.IO as IO
+import qualified System.Process as Process
+import qualified System.Timeout as Timeout
 
 main :: IO ()
 main = Test.run test
@@ -52,117 +62,216 @@ test =
         ("Content" `Text.isInfixOf` result) Test.@=? True,
       Test.unit "webReaderTool has correct schema" <| do
         let tool = webReaderTool "test-key"
-        Engine.toolName tool Test.@=? "read_webpage"
+        Engine.toolName tool Test.@=? "read_webpages"
     ]
 
+-- | Fetch timeout in microseconds (15 seconds - short because blocked sites won't respond anyway)
+fetchTimeoutMicros :: Int
+fetchTimeoutMicros = 15 * 1000000
+
+-- | Summarization timeout in microseconds (30 seconds)
+summarizeTimeoutMicros :: Int
+summarizeTimeoutMicros = 30 * 1000000
+
+-- | Maximum concurrent fetches
+maxConcurrentFetches :: Int
+maxConcurrentFetches = 10
+
+-- | Simple debug logging to stderr
+dbg :: Text -> IO ()
+dbg = TIO.hPutStrLn IO.stderr
+
 fetchWebpage :: Text -> IO (Either Text Text)
 fetchWebpage url = do
+  dbg ("[webreader] Fetching: " <> url)
   result <-
-    try <| do
-      req0 <- HTTP.parseRequest (Text.unpack url)
-      let req =
-            HTTP.setRequestMethod "GET"
-              <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
-              <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
-              <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
-              <| req0
-      HTTP.httpLBS req
+    Timeout.timeout fetchTimeoutMicros <| do
+      innerResult <-
+        try <| do
+          req0 <- HTTP.parseRequest (Text.unpack url)
+          let req =
+                HTTP.setRequestMethod "GET"
+                  <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+                  <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+                  <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro fetchTimeoutMicros)
+                  <| req0
+          HTTP.httpLBS req
+      case innerResult of
+        Left (e :: SomeException) -> do
+          dbg ("[webreader] Fetch error: " <> url <> " - " <> tshow e)
+          pure (Left ("Failed to fetch URL: " <> tshow e))
+        Right response -> do
+          let status = HTTP.getResponseStatusCode response
+          if status >= 200 && status < 300
+            then do
+              let body = HTTP.getResponseBody response
+                  text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+                  len = Text.length text
+              dbg ("[webreader] Fetched: " <> url <> " (" <> tshow len <> " chars)")
+              pure (Right text)
+            else do
+              dbg ("[webreader] HTTP " <> tshow status <> ": " <> url)
+              pure (Left ("HTTP error: " <> tshow status))
   case result of
-    Left (e :: SomeException) ->
-      pure (Left ("Failed to fetch URL: " <> tshow e))
-    Right response -> do
-      let status = HTTP.getResponseStatusCode response
-      if status >= 200 && status < 300
-        then do
-          let body = HTTP.getResponseBody response
-              text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
-          pure (Right text)
-        else pure (Left ("HTTP error: " <> tshow status))
+    Nothing -> do
+      dbg ("[webreader] Timeout: " <> url)
+      pure (Left ("Timeout fetching " <> url))
+    Just r -> pure r
 
+-- | Fast single-pass text extraction from HTML
+-- Strips all tags in one pass, no expensive operations
 extractText :: Text -> Text
-extractText html =
-  let noScript = removeTagContent "script" html
-      noStyle = removeTagContent "style" noScript
-      noNoscript = removeTagContent "noscript" noStyle
-      noTags = stripTags noNoscript
-   in collapseWhitespace noTags
+extractText html = collapseWhitespace (stripAllTags html)
   where
-    removeTagContent :: Text -> Text -> Text
-    removeTagContent tag txt =
-      let openTag = "<" <> tag
-          closeTag = "</" <> tag <> ">"
-       in removeMatches openTag closeTag txt
-
-    removeMatches :: Text -> Text -> Text -> Text
-    removeMatches open close txt =
-      case Text.breakOn open (Text.toLower txt) of
-        (_, "") -> txt
-        (before, _) ->
-          let actualBefore = Text.take (Text.length before) txt
-              rest = Text.drop (Text.length before) txt
-           in case Text.breakOn close (Text.toLower rest) of
-                (_, "") -> actualBefore
-                (_, afterClose) ->
-                  let skipLen = Text.length close
-                      remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
-                   in actualBefore <> removeMatches open close remaining
-
-    stripTags :: Text -> Text
-    stripTags txt = go txt ""
+    -- Single pass: accumulate text outside of tags
+    stripAllTags :: Text -> Text
+    stripAllTags txt = Text.pack (go (Text.unpack txt) False [])
       where
-        go :: Text -> Text -> Text
-        go remaining acc =
-          case Text.breakOn "<" remaining of
-            (before, "") -> acc <> before
-            (before, rest) ->
-              case Text.breakOn ">" rest of
-                (_, "") -> acc <> before
-                (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
-
+        go :: [Char] -> Bool -> [Char] -> [Char]
+        go [] _ acc = reverse acc
+        go ('<' : rest) _ acc = go rest True acc -- Enter tag
+        go ('>' : rest) True acc = go rest False (' ' : acc) -- Exit tag, add space
+        go (_ : rest) True acc = go rest True acc -- Inside tag, skip
+        go (c : rest) False acc = go rest False (c : acc) -- Outside tag, keep
     collapseWhitespace = Text.unwords <. Text.words
 
+-- | Maximum chars to send for summarization (keep it small for fast LLM response)
+maxContentForSummary :: Int
+maxContentForSummary = 15000
+
+-- | Maximum summary length to return
+maxSummaryLength :: Int
+maxSummaryLength = 1000
+
+-- | Timeout for trafilatura extraction in microseconds (10 seconds)
+extractTimeoutMicros :: Int
+extractTimeoutMicros = 10 * 1000000
+
+-- | Extract article content using trafilatura (Python library)
+-- Falls back to naive extractText if trafilatura fails
+extractWithTrafilatura :: Text -> IO Text
+extractWithTrafilatura html = do
+  let pythonScript =
+        "import sys; import trafilatura; "
+          <> "html = sys.stdin.read(); "
+          <> "result = trafilatura.extract(html, include_comments=False, include_tables=False); "
+          <> "print(result if result else '')"
+      proc =
+        (Process.proc "python3" ["-c", Text.unpack pythonScript])
+          { Process.std_in = Process.CreatePipe,
+            Process.std_out = Process.CreatePipe,
+            Process.std_err = Process.CreatePipe
+          }
+  result <-
+    Timeout.timeout extractTimeoutMicros <| do
+      (exitCode, stdoutStr, _stderrStr) <- Process.readCreateProcessWithExitCode proc (Text.unpack html)
+      case exitCode of
+        Exit.ExitSuccess -> pure (Text.strip (Text.pack stdoutStr))
+        Exit.ExitFailure _ -> pure ""
+  case result of
+    Just txt | not (Text.null txt) -> pure txt
+    _ -> do
+      dbg "[webreader] trafilatura failed, falling back to naive extraction"
+      pure (extractText (Text.take 100000 html))
+
 summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
 summarizeContent apiKey url content = do
-  let truncatedContent = Text.take 50000 content
-      gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+  let truncatedContent = Text.take maxContentForSummary content
+      haiku = Provider.defaultOpenRouter apiKey "anthropic/claude-haiku-4.5"
+  dbg ("[webreader] Summarizing: " <> url <> " (" <> tshow (Text.length truncatedContent) <> " chars)")
+  dbg "[webreader] Calling LLM for summarization..."
+  startTime <- getCurrentTime
   result <-
-    Provider.chat
-      gemini
-      []
-      [ Provider.Message
-          Provider.System
-          "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
-          Nothing
-          Nothing,
-        Provider.Message
-          Provider.User
-          ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
-          Nothing
-          Nothing
-      ]
+    Timeout.timeout summarizeTimeoutMicros
+      <| Provider.chat
+        haiku
+        []
+        [ Provider.Message
+            Provider.System
+            ( "You are a webpage summarizer. Extract the key information in 3-5 bullet points. "
+                <> "Be extremely concise - max 500 characters total. No preamble, just bullets."
+            )
+            Nothing
+            Nothing,
+          Provider.Message
+            Provider.User
+            ("Summarize: " <> url <> "\n\n" <> truncatedContent)
+            Nothing
+            Nothing
+        ]
+  endTime <- getCurrentTime
+  let elapsed = diffUTCTime endTime startTime
+  dbg ("[webreader] LLM call completed in " <> tshow elapsed)
   case result of
-    Left err -> pure (Left ("Summarization failed: " <> err))
-    Right msg -> pure (Right (Provider.msgContent msg))
+    Nothing -> do
+      dbg ("[webreader] Summarize timeout after " <> tshow elapsed <> ": " <> url)
+      pure (Left ("Timeout summarizing " <> url))
+    Just (Left err) -> do
+      dbg ("[webreader] Summarize error: " <> url <> " - " <> err)
+      pure (Left ("Summarization failed: " <> err))
+    Just (Right msg) -> do
+      let summary = Text.take maxSummaryLength (Provider.msgContent msg)
+      dbg ("[webreader] Summarized: " <> url <> " (" <> tshow (Text.length summary) <> " chars)")
+      pure (Right summary)
 
+-- | Fetch and summarize a single URL, returning a result object
+-- This is the core function used by both single and batch tools
+fetchAndSummarize :: Text -> Text -> IO Aeson.Value
+fetchAndSummarize apiKey url = do
+  fetchResult <- fetchWebpage url
+  case fetchResult of
+    Left err ->
+      pure (Aeson.object ["url" .= url, "error" .= err])
+    Right html -> do
+      dbg ("[webreader] Extracting article from: " <> url <> " (" <> tshow (Text.length html) <> " chars HTML)")
+      extractStart <- getCurrentTime
+      textContent <- extractWithTrafilatura html
+      extractEnd <- getCurrentTime
+      let extractElapsed = diffUTCTime extractEnd extractStart
+      dbg ("[webreader] Extracted: " <> url <> " (" <> tshow (Text.length textContent) <> " chars text) in " <> tshow extractElapsed)
+      if Text.null (Text.strip textContent)
+        then pure (Aeson.object ["url" .= url, "error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+        else do
+          summaryResult <- summarizeContent apiKey url textContent
+          case summaryResult of
+            Left err ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "error" .= err,
+                      "raw_content" .= Text.take 2000 textContent
+                    ]
+                )
+            Right summary ->
+              pure
+                ( Aeson.object
+                    [ "url" .= url,
+                      "success" .= True,
+                      "summary" .= summary
+                    ]
+                )
+
+-- | Web reader tool - fetches and summarizes webpages in parallel
 webReaderTool :: Text -> Engine.Tool
 webReaderTool apiKey =
   Engine.Tool
-    { Engine.toolName = "read_webpage",
+    { Engine.toolName = "read_webpages",
       Engine.toolDescription =
-        "Fetch and summarize a webpage. Use this when the user shares a URL or link "
-          <> "and wants to know what it contains. Returns a summary of the page content.",
+        "Fetch and summarize webpages in parallel. Each page is processed independently - "
+          <> "failures on one page won't affect others. Returns a list of summaries.",
       Engine.toolJsonSchema =
         Aeson.object
           [ "type" .= ("object" :: Text),
             "properties"
               .= Aeson.object
-                [ "url"
+                [ "urls"
                     .= Aeson.object
-                      [ "type" .= ("string" :: Text),
-                        "description" .= ("The URL of the webpage to read" :: Text)
+                      [ "type" .= ("array" :: Text),
+                        "items" .= Aeson.object ["type" .= ("string" :: Text)],
+                        "description" .= ("List of URLs to read and summarize" :: Text)
                       ]
                 ],
-            "required" .= (["url"] :: [Text])
+            "required" .= (["urls"] :: [Text])
           ],
       Engine.toolExecute = executeWebReader apiKey
     }
@@ -172,39 +281,28 @@ executeWebReader apiKey v =
   case Aeson.fromJSON v of
     Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
     Aeson.Success (args :: WebReaderArgs) -> do
-      fetchResult <- fetchWebpage (wrUrl args)
-      case fetchResult of
-        Left err ->
-          pure (Aeson.object ["error" .= err])
-        Right html -> do
-          let textContent = extractText html
-          if Text.null (Text.strip textContent)
-            then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
-            else do
-              summaryResult <- summarizeContent apiKey (wrUrl args) textContent
-              case summaryResult of
-                Left err ->
-                  pure
-                    ( Aeson.object
-                        [ "error" .= err,
-                          "raw_content" .= Text.take 2000 textContent
-                        ]
-                    )
-                Right summary ->
-                  pure
-                    ( Aeson.object
-                        [ "success" .= True,
-                          "url" .= wrUrl args,
-                          "summary" .= summary
-                        ]
-                    )
+      let urls = wrUrls args
+      dbg ("[webreader] Starting batch: " <> tshow (length urls) <> " URLs")
+      results <- Sema.mapPool maxConcurrentFetches (fetchAndSummarize apiKey) urls
+      let succeeded = length (filter isSuccess results)
+      dbg ("[webreader] Batch complete: " <> tshow succeeded <> "/" <> tshow (length urls) <> " succeeded")
+      pure
+        ( Aeson.object
+            [ "results" .= results,
+              "total" .= length urls,
+              "succeeded" .= succeeded
+            ]
+        )
+  where
+    isSuccess (Aeson.Object obj) = KeyMap.member "success" obj
+    isSuccess _ = False
 
 newtype WebReaderArgs = WebReaderArgs
-  { wrUrl :: Text
+  { wrUrls :: [Text]
   }
   deriving (Generic)
 
 instance Aeson.FromJSON WebReaderArgs where
   parseJSON =
     Aeson.withObject "WebReaderArgs" <| \v ->
-      WebReaderArgs </ v Aeson..: "url"
+      WebReaderArgs </ v Aeson..: "urls"
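
Control.Concurrent.Sema is imported above but not shown in this diff; a
minimal sketch of what its mapPool might look like, assuming a plain
QSem bound over the async library (the real module may differ):

    import Control.Concurrent.Async (mapConcurrently)
    import Control.Concurrent.QSem (newQSem, signalQSem, waitQSem)
    import Control.Exception (bracket_)

    -- | Map an IO action over a list, running at most n at a time.
    -- mapConcurrently preserves input order, so results line up with URLs.
    mapPool :: Int -> (a -> IO b) -> [a] -> IO [b]
    mapPool n f xs = do
      sem <- newQSem n
      mapConcurrently (\x -> bracket_ (waitQSem sem) (signalQSem sem) (f x)) xs

Taken together, a hypothetical direct call to the finished batch tool
(key and URL are placeholders) and the shape of its result:

    Engine.toolExecute
      (webReaderTool "sk-or-placeholder")
      (Aeson.object ["urls" .= (["https://example.com"] :: [Text])])
    -- => {"results":[{"url":"https://example.com","success":true,"summary":"..."}],
    --     "total":1,"succeeded":1}

A page that fails produces a {"url":...,"error":...} entry instead, so
one bad URL never blocks the rest of the batch.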