summaryrefslogtreecommitdiff
path: root/Omni/Agent/Tools
diff options
context:
space:
mode:
Diffstat (limited to 'Omni/Agent/Tools')
-rw-r--r--Omni/Agent/Tools/WebReader.hs210
1 files changed, 210 insertions, 0 deletions
diff --git a/Omni/Agent/Tools/WebReader.hs b/Omni/Agent/Tools/WebReader.hs
new file mode 100644
index 0000000..9b776ad
--- /dev/null
+++ b/Omni/Agent/Tools/WebReader.hs
@@ -0,0 +1,210 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Web page reader tool - fetches and summarizes web pages.
+--
+-- : out omni-agent-tools-webreader
+-- : dep aeson
+-- : dep http-conduit
+module Omni.Agent.Tools.WebReader
+ ( -- * Tool
+ webReaderTool,
+
+ -- * Direct API
+ fetchWebpage,
+ extractText,
+
+ -- * Testing
+ main,
+ test,
+ )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text as Text
+import qualified Data.Text.Encoding as TE
+import qualified Network.HTTP.Client as HTTPClient
+import qualified Network.HTTP.Simple as HTTP
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Agent.Provider as Provider
+import qualified Omni.Test as Test
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+ Test.group
+ "Omni.Agent.Tools.WebReader"
+ [ Test.unit "extractText removes HTML tags" <| do
+ let html = "<html><body><p>Hello world</p></body></html>"
+ result = extractText html
+ ("Hello world" `Text.isInfixOf` result) Test.@=? True,
+ Test.unit "extractText removes script tags" <| do
+ let html = "<html><script>alert('hi')</script><p>Content</p></html>"
+ result = extractText html
+ ("alert" `Text.isInfixOf` result) Test.@=? False
+ ("Content" `Text.isInfixOf` result) Test.@=? True,
+ Test.unit "webReaderTool has correct schema" <| do
+ let tool = webReaderTool "test-key"
+ Engine.toolName tool Test.@=? "read_webpage"
+ ]
+
+fetchWebpage :: Text -> IO (Either Text Text)
+fetchWebpage url = do
+ result <-
+ try <| do
+ req0 <- HTTP.parseRequest (Text.unpack url)
+ let req =
+ HTTP.setRequestMethod "GET"
+ <| HTTP.setRequestHeader "User-Agent" ["Mozilla/5.0 (compatible; OmniBot/1.0)"]
+ <| HTTP.setRequestHeader "Accept" ["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"]
+ <| HTTP.setRequestResponseTimeout (HTTPClient.responseTimeoutMicro (30 * 1000000))
+ <| req0
+ HTTP.httpLBS req
+ case result of
+ Left (e :: SomeException) ->
+ pure (Left ("Failed to fetch URL: " <> tshow e))
+ Right response -> do
+ let status = HTTP.getResponseStatusCode response
+ if status >= 200 && status < 300
+ then do
+ let body = HTTP.getResponseBody response
+ text = TE.decodeUtf8With (\_ _ -> Just '?') (BL.toStrict body)
+ pure (Right text)
+ else pure (Left ("HTTP error: " <> tshow status))
+
+extractText :: Text -> Text
+extractText html =
+ let noScript = removeTagContent "script" html
+ noStyle = removeTagContent "style" noScript
+ noNoscript = removeTagContent "noscript" noStyle
+ noTags = stripTags noNoscript
+ in collapseWhitespace noTags
+ where
+ removeTagContent :: Text -> Text -> Text
+ removeTagContent tag txt =
+ let openTag = "<" <> tag
+ closeTag = "</" <> tag <> ">"
+ in removeMatches openTag closeTag txt
+
+ removeMatches :: Text -> Text -> Text -> Text
+ removeMatches open close txt =
+ case Text.breakOn open (Text.toLower txt) of
+ (_, "") -> txt
+ (before, _) ->
+ let actualBefore = Text.take (Text.length before) txt
+ rest = Text.drop (Text.length before) txt
+ in case Text.breakOn close (Text.toLower rest) of
+ (_, "") -> actualBefore
+ (_, afterClose) ->
+ let skipLen = Text.length close
+ remaining = Text.drop (Text.length rest - Text.length afterClose + skipLen) txt
+ in actualBefore <> removeMatches open close remaining
+
+ stripTags :: Text -> Text
+ stripTags txt = go txt ""
+ where
+ go :: Text -> Text -> Text
+ go remaining acc =
+ case Text.breakOn "<" remaining of
+ (before, "") -> acc <> before
+ (before, rest) ->
+ case Text.breakOn ">" rest of
+ (_, "") -> acc <> before
+ (_, afterTag) -> go (Text.drop 1 afterTag) (acc <> before <> " ")
+
+ collapseWhitespace = Text.unwords <. Text.words
+
+summarizeContent :: Text -> Text -> Text -> IO (Either Text Text)
+summarizeContent apiKey url content = do
+ let truncatedContent = Text.take 50000 content
+ gemini = Provider.defaultOpenRouter apiKey "google/gemini-2.0-flash-001"
+ result <-
+ Provider.chat
+ gemini
+ []
+ [ Provider.Message
+ Provider.System
+ "You are a webpage summarizer. Provide a concise summary of the webpage content. Focus on the main points and key information. Be brief but comprehensive."
+ Nothing
+ Nothing,
+ Provider.Message
+ Provider.User
+ ("Summarize this webpage (" <> url <> "):\n\n" <> truncatedContent)
+ Nothing
+ Nothing
+ ]
+ case result of
+ Left err -> pure (Left ("Summarization failed: " <> err))
+ Right msg -> pure (Right (Provider.msgContent msg))
+
+webReaderTool :: Text -> Engine.Tool
+webReaderTool apiKey =
+ Engine.Tool
+ { Engine.toolName = "read_webpage",
+ Engine.toolDescription =
+ "Fetch and summarize a webpage. Use this when the user shares a URL or link "
+ <> "and wants to know what it contains. Returns a summary of the page content.",
+ Engine.toolJsonSchema =
+ Aeson.object
+ [ "type" .= ("object" :: Text),
+ "properties"
+ .= Aeson.object
+ [ "url"
+ .= Aeson.object
+ [ "type" .= ("string" :: Text),
+ "description" .= ("The URL of the webpage to read" :: Text)
+ ]
+ ],
+ "required" .= (["url"] :: [Text])
+ ],
+ Engine.toolExecute = executeWebReader apiKey
+ }
+
+executeWebReader :: Text -> Aeson.Value -> IO Aeson.Value
+executeWebReader apiKey v =
+ case Aeson.fromJSON v of
+ Aeson.Error e -> pure (Aeson.object ["error" .= Text.pack e])
+ Aeson.Success (args :: WebReaderArgs) -> do
+ fetchResult <- fetchWebpage (wrUrl args)
+ case fetchResult of
+ Left err ->
+ pure (Aeson.object ["error" .= err])
+ Right html -> do
+ let textContent = extractText html
+ if Text.null (Text.strip textContent)
+ then pure (Aeson.object ["error" .= ("Page appears to be empty or JavaScript-only" :: Text)])
+ else do
+ summaryResult <- summarizeContent apiKey (wrUrl args) textContent
+ case summaryResult of
+ Left err ->
+ pure
+ ( Aeson.object
+ [ "error" .= err,
+ "raw_content" .= Text.take 2000 textContent
+ ]
+ )
+ Right summary ->
+ pure
+ ( Aeson.object
+ [ "success" .= True,
+ "url" .= wrUrl args,
+ "summary" .= summary
+ ]
+ )
+
+newtype WebReaderArgs = WebReaderArgs
+ { wrUrl :: Text
+ }
+ deriving (Generic)
+
+instance Aeson.FromJSON WebReaderArgs where
+ parseJSON =
+ Aeson.withObject "WebReaderArgs" <| \v ->
+ WebReaderArgs </ (v Aeson..: "url")