-- Provenance: contents of a new file, Omni/Agent/Tools/WebReaderTest.hs,
-- introduced by the following commit (patch rendered by cgit v1.2.3):
--
--   From b18bd4eee969681ee532c4898ddaaa0851e6b846 Mon Sep 17 00:00:00 2001
--   From: Ben Sima
--   Date: Tue, 16 Dec 2025 13:24:54 -0500
--   Subject: Batch web_reader tool, much faster
--
--   Added retry with backoff, parallel processing, editing pages down to
--   main content, summarization with haiku. It's so much faster and more
--   reliable now.
--
--   Plus improved the logging system and disentangled the status UI bar
--   from the logging module.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

-- | Manual smoke test for "Omni.Agent.Tools.WebReader", written to debug hangs.
--
-- Fetches three fixed URLs one after another and, for each, prints how long
-- the fetch and the text extraction took, together with the sizes involved.
--
-- : out webreader-test
-- : dep http-conduit
-- : run trafilatura
module Omni.Agent.Tools.WebReaderTest where

import Alpha
import qualified Data.Text as Text
import qualified Data.Text.IO as TIO
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import qualified Omni.Agent.Tools.WebReader as WebReader

-- | Entry point: print an opening banner, run 'testUrl' against three fixed
-- pages (each preceded by its own heading), then print a closing banner.
main :: IO ()
main = do
  TIO.putStrLn "=== WebReader Debug Test ==="
  runCase "\n--- Test 1: Small page (httpbin) ---" "https://httpbin.org/html"
  runCase "\n--- Test 2: Medium page (example.com) ---" "https://example.com"
  runCase "\n--- Test 3: Large page (github) ---" "https://github.com/anthropics/skills"
  TIO.putStrLn "\n=== Done ==="
  where
    -- Announce one case, then exercise its URL.
    runCase heading url = do
      TIO.putStrLn heading
      testUrl url

-- | Fetch @url@ with 'WebReader.fetchWebpage' and report on stdout.
--
-- Always prints the URL and the wall-clock duration of the fetch. On 'Left'
-- the error text is printed and nothing else happens. On 'Right' the HTML
-- length is printed, the first 100000 characters are passed to
-- 'WebReader.extractText', and the extraction duration, the extracted text
-- length and a 200-character preview are printed.
testUrl :: Text -> IO ()
testUrl url = do
  say ["Fetching: ", url]
  fetchBegan <- getCurrentTime
  outcome <- WebReader.fetchWebpage url
  fetchEnded <- getCurrentTime
  say ["Fetch took: ", tshow (diffUTCTime fetchEnded fetchBegan)]
  case outcome of
    Left problem -> say ["Fetch error: ", problem]
    Right fetched -> reportExtraction fetched
  where
    -- Print the concatenation of the given fragments as a single line.
    say fragments = TIO.putStrLn (Text.concat fragments)

    -- Time the text extraction of an already-fetched page and print stats.
    reportExtraction page = do
      say ["HTML size: ", tshow (Text.length page), " chars"]
      TIO.putStrLn "Extracting text (naive, 100k truncated)..."
      extractBegan <- getCurrentTime
      -- The bang forces the extraction here, between the two clock reads,
      -- so the measured interval covers the work instead of a lazy thunk.
      let !body = WebReader.extractText (Text.take 100000 page)
      extractEnded <- getCurrentTime
      say ["Extract took: ", tshow (diffUTCTime extractEnded extractBegan)]
      say ["Text size: ", tshow (Text.length body), " chars"]
      say ["Preview: ", Text.take 200 body]