summaryrefslogtreecommitdiff
path: root/Omni/Agent/Tools/WebReaderTest.hs
diff options
context:
space:
mode:
authorBen Sima <ben@bensima.com>2025-12-16 13:24:54 -0500
committerBen Sima <ben@bensima.com>2025-12-16 13:24:54 -0500
commitb18bd4eee969681ee532c4898ddaaa0851e6b846 (patch)
tree0a966754459c5873b9dad4289ea51e901bd4399b /Omni/Agent/Tools/WebReaderTest.hs
parent122d73ac9d2472f91ed00965d03d1e761da72699 (diff)
Batch web_reader tool, much faster
Added retry with backoff, parallel proccessing, editing pages down to main content, summarization with haiku. It's so much faster and more reliable now. Plus improved the logging system and distangled the status UI bar from the logging module.
Diffstat (limited to 'Omni/Agent/Tools/WebReaderTest.hs')
-rw-r--r--Omni/Agent/Tools/WebReaderTest.hs53
1 files changed, 53 insertions, 0 deletions
diff --git a/Omni/Agent/Tools/WebReaderTest.hs b/Omni/Agent/Tools/WebReaderTest.hs
new file mode 100644
index 0000000..ca4c119
--- /dev/null
+++ b/Omni/Agent/Tools/WebReaderTest.hs
@@ -0,0 +1,53 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Quick test for WebReader to debug hangs
+--
+-- : out webreader-test
+-- : dep http-conduit
+-- : run trafilatura
+module Omni.Agent.Tools.WebReaderTest where
+
+import Alpha
+import qualified Data.Text as Text
+import qualified Data.Text.IO as TIO
+import Data.Time.Clock (diffUTCTime, getCurrentTime)
+import qualified Omni.Agent.Tools.WebReader as WebReader
+
+main :: IO ()
+main = do
+ TIO.putStrLn "=== WebReader Debug Test ==="
+
+ TIO.putStrLn "\n--- Test 1: Small page (httpbin) ---"
+ testUrl "https://httpbin.org/html"
+
+ TIO.putStrLn "\n--- Test 2: Medium page (example.com) ---"
+ testUrl "https://example.com"
+
+ TIO.putStrLn "\n--- Test 3: Large page (github) ---"
+ testUrl "https://github.com/anthropics/skills"
+
+ TIO.putStrLn "\n=== Done ==="
+
+testUrl :: Text -> IO ()
+testUrl url = do
+ TIO.putStrLn ("Fetching: " <> url)
+
+ startFetch <- getCurrentTime
+ result <- WebReader.fetchWebpage url
+ endFetch <- getCurrentTime
+ TIO.putStrLn ("Fetch took: " <> tshow (diffUTCTime endFetch startFetch))
+
+ case result of
+ Left err -> TIO.putStrLn ("Fetch error: " <> err)
+ Right html -> do
+ TIO.putStrLn ("HTML size: " <> tshow (Text.length html) <> " chars")
+
+ TIO.putStrLn "Extracting text (naive, 100k truncated)..."
+ startExtract <- getCurrentTime
+ let !text = WebReader.extractText (Text.take 100000 html)
+ endExtract <- getCurrentTime
+ TIO.putStrLn ("Extract took: " <> tshow (diffUTCTime endExtract startExtract))
+ TIO.putStrLn ("Text size: " <> tshow (Text.length text) <> " chars")
+ TIO.putStrLn ("Preview: " <> Text.take 200 text)