blob: ca4c11904a8a6559ecfc7be6d88d84f76c62450a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- | Quick test for WebReader to debug hangs
--
-- : out webreader-test
-- : dep http-conduit
-- : run trafilatura
module Omni.Agent.Tools.WebReaderTest where
import Alpha
import qualified Data.Text as Text
import qualified Data.Text.IO as TIO
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import qualified Omni.Agent.Tools.WebReader as WebReader
-- | Entry point for the debug run.
--
-- Prints an opening banner, exercises 'testUrl' against three pages
-- (each preceded by its own heading line), and prints a closing banner.
main :: IO ()
main = do
  TIO.putStrLn "=== WebReader Debug Test ==="
  runCase "\n--- Test 1: Small page (httpbin) ---" "https://httpbin.org/html"
  runCase "\n--- Test 2: Medium page (example.com) ---" "https://example.com"
  runCase "\n--- Test 3: Large page (github) ---" "https://github.com/anthropics/skills"
  TIO.putStrLn "\n=== Done ==="
  where
    -- Print the heading for one case, then run 'testUrl' on its URL.
    runCase :: Text -> Text -> IO ()
    runCase heading target = do
      TIO.putStrLn heading
      testUrl target
-- | Fetch one URL through "Omni.Agent.Tools.WebReader" and print timings.
--
-- Reports the wall-clock duration of 'WebReader.fetchWebpage'. If the fetch
-- returns 'Left', the error text is printed and nothing else happens. If it
-- returns 'Right', the HTML length is printed, 'WebReader.extractText' is run
-- on the first 100000 characters, and the extraction duration, the extracted
-- text length and a 200-character preview are printed.
testUrl :: Text -> IO ()
testUrl url = do
  TIO.putStrLn ("Fetching: " <> url)
  fetchBegan <- getCurrentTime
  fetched <- WebReader.fetchWebpage url
  fetchEnded <- getCurrentTime
  printElapsed "Fetch took: " fetchBegan fetchEnded
  case fetched of
    Left failure -> TIO.putStrLn ("Fetch error: " <> failure)
    Right page -> reportExtraction page
  where
    -- Print a label followed by the time elapsed between two timestamps.
    printElapsed label began ended =
      TIO.putStrLn (label <> tshow (diffUTCTime ended began))

    -- Time the text extraction for a successfully fetched page and
    -- print a summary of the result.
    reportExtraction page = do
      TIO.putStrLn ("HTML size: " <> tshow (Text.length page) <> " chars")
      TIO.putStrLn "Extracting text (naive, 100k truncated)..."
      extractBegan <- getCurrentTime
      -- The bang forces the result (to weak head normal form) before the
      -- end timestamp is taken, so the measured interval includes the
      -- extraction work instead of deferring it to the prints below.
      let !extracted = WebReader.extractText (Text.take 100000 page)
      extractEnded <- getCurrentTime
      printElapsed "Extract took: " extractBegan extractEnded
      TIO.putStrLn ("Text size: " <> tshow (Text.length extracted) <> " chars")
      TIO.putStrLn ("Preview: " <> Text.take 200 extracted)
|