{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NoImplicitPrelude #-} -- | Hardened Coder Subagent with automatic init/verify/commit phases. -- -- Based on Anthropic's "Effective Harnesses for Long-Running Agents": -- https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents -- -- Key features: -- - Init phase: Check git status, build namespace to detect broken state -- - Work phase: Standard agent loop with tools -- - Verify phase: Run lint --fix, build namespace, run tests -- - Commit phase: Git commit with namespace-prefixed message -- - Recovery phase: Revert changes on failure -- -- : out omni-agent-subagent-coder -- : dep aeson -- : dep async -- : dep directory -- : dep mustache -- : dep stm -- : dep time module Omni.Agent.Subagent.Coder ( -- * Types CoderConfig (..), CoderContext (..), InitResult (..), VerifyResult (..), -- * Main Entry Point runCoderSubagent, -- * Phases (exported for testing) runCoderInit, runCoderVerify, runCoderCommit, runCoderRecovery, -- * Helpers defaultCoderConfig, coderSystemPrompt, coderTools, -- * Testing main, test, ) where import Alpha import Data.Aeson ((.=)) import qualified Data.Aeson as Aeson import qualified Data.Text as Text import qualified Data.Time.Clock as Clock import qualified Omni.Agent.Engine as Engine import qualified Omni.Agent.Prompts as Prompts import qualified Omni.Agent.Provider as Provider import qualified Omni.Agent.Tools as Tools import qualified Omni.Test as Test import qualified System.Exit as Exit import qualified System.Process as Process main :: IO () main = Test.run test test :: Test.Tree test = Test.group "Omni.Agent.Subagent.Coder" [ Test.unit "CoderConfig has sensible defaults" <| do let cfg = defaultCoderConfig "Omni/Task" "Implement feature X" coderNamespace cfg Test.@=? "Omni/Task" coderTimeout cfg Test.@=? 600, Test.unit "coderSystemPrompt includes namespace" <| do let cfg = defaultCoderConfig "Biz/Cloud" "Fix the bug" let prompt = coderSystemPrompt cfg Nothing Text.isInfixOf "Biz/Cloud" prompt Test.@=? True, Test.unit "runBashCapture returns output" <| do (code, out, _) <- runBashCapture "echo hello" code Test.@=? Exit.ExitSuccess Text.strip out Test.@=? "hello" ] -- | Configuration for a Coder subagent run data CoderConfig = CoderConfig { coderNamespace :: Text, coderTask :: Text, coderContext :: Maybe Text, coderModel :: Text, coderTimeout :: Int, coderMaxCost :: Double, coderMaxTokens :: Int, coderMaxIterations :: Int, coderMaxVerifyRetries :: Int } deriving (Show, Eq, Generic) instance Aeson.ToJSON CoderConfig instance Aeson.FromJSON CoderConfig -- | Runtime context gathered during init phase data CoderContext = CoderContext { ctxGitBranch :: Text, ctxInitiallyBroken :: Bool, ctxChangedFiles :: [Text] } deriving (Show, Eq, Generic) -- | Result of the init phase data InitResult = InitSuccess CoderContext | InitBrokenState Text CoderContext | InitFailed Text deriving (Show, Eq) -- | Result of the verify phase data VerifyResult = VerifySuccess | VerifyLintFailed Text | VerifyTypecheckFailed Text | VerifyBuildFailed Text | VerifyTestFailed Text deriving (Show, Eq) defaultCoderConfig :: Text -> Text -> CoderConfig defaultCoderConfig namespace task = CoderConfig { coderNamespace = namespace, coderTask = task, coderContext = Nothing, coderModel = "anthropic/claude-sonnet-4", coderTimeout = 600, coderMaxCost = 50.0, coderMaxTokens = 100000, coderMaxIterations = 20, coderMaxVerifyRetries = 2 } -- | Run a bash command and capture output -- Uses direnv exec to ensure the nix shell environment is loaded runBashCapture :: Text -> IO (Exit.ExitCode, Text, Text) runBashCapture cmd = do (code, out, err) <- Process.readProcessWithExitCode "direnv" ["exec", ".", "bash", "-c", Text.unpack cmd] "" pure (code, Text.pack out, Text.pack err) -- | Phase 1: Initialize - check environment, detect broken state runCoderInit :: CoderConfig -> IO InitResult runCoderInit cfg = do -- Pull latest changes _ <- runBashCapture "git pull --ff-only 2>&1 || true" -- Check git status (_, branchOut, _) <- runBashCapture "git branch --show-current" let gitBranch = Text.strip branchOut -- Try to build the namespace to detect broken state (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1 || true") let isBroken = buildCode /= Exit.ExitSuccess let ctx = CoderContext { ctxGitBranch = gitBranch, ctxInitiallyBroken = isBroken, ctxChangedFiles = [] } if isBroken then pure <| InitBrokenState ("Build failed: " <> Text.take 500 (buildOut <> buildErr)) ctx else pure <| InitSuccess ctx -- | Phase 3: Verify - run lint --fix, typecheck, build, test -- -- Verification order: -- 1. lint --fix: Auto-apply formatting and simple fixes -- 2. typecheck.sh: Run type checker (works for Python and other languages) -- 3. bild: Build/compile the namespace -- 4. bild --test: Run tests runCoderVerify :: CoderConfig -> IO VerifyResult runCoderVerify cfg = do -- First, run lint --fix to auto-apply formatting _ <- runBashCapture ("lint --fix " <> coderNamespace cfg <> ".hs 2>&1") -- Run typecheck.sh (handles Python mypy, etc.) (typecheckCode, typecheckOut, typecheckErr) <- runBashCapture ("typecheck.sh " <> coderNamespace cfg <> " 2>&1 || true") -- Note: typecheck.sh may not exist for all namespaces, so we use || true -- and only fail if it explicitly returns non-zero with actual errors let typecheckFailed = typecheckCode /= Exit.ExitSuccess && not (Text.null (Text.strip typecheckOut)) && not ("No such file" `Text.isInfixOf` typecheckErr) if typecheckFailed then pure <| VerifyTypecheckFailed (Text.take 1000 (typecheckOut <> typecheckErr)) else do -- Build the namespace (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1") case buildCode of Exit.ExitSuccess -> do -- Run tests if they exist (testCode, testOut, testErr) <- runBashCapture ("bild --test " <> coderNamespace cfg <> " 2>&1") case testCode of Exit.ExitSuccess -> pure VerifySuccess _ -> pure <| VerifyTestFailed (Text.take 1000 (testOut <> testErr)) _ -> pure <| VerifyBuildFailed (Text.take 1000 (buildOut <> buildErr)) -- | Phase 4: Commit - git commit with message runCoderCommit :: CoderConfig -> Text -> IO (Either Text ()) runCoderCommit cfg summary = do let commitMsg = coderNamespace cfg <> ": " <> Text.take 72 summary -- Stage all changes (addCode, _, addErr) <- runBashCapture "git add -A" case addCode of Exit.ExitSuccess -> do -- Commit (commitCode, _, commitErr) <- runBashCapture ("git commit -m " <> quoteShell commitMsg) case commitCode of Exit.ExitSuccess -> pure (Right ()) _ -> pure <| Left ("git commit failed: " <> commitErr) _ -> pure <| Left ("git add failed: " <> addErr) -- | Phase 5: Recovery - revert changes on failure runCoderRecovery :: IO () runCoderRecovery = do -- Revert uncommitted changes _ <- runBashCapture "git checkout -- ." _ <- runBashCapture "git clean -fd" pure () -- | Quote a string for shell quoteShell :: Text -> Text quoteShell t = "'" <> Text.replace "'" "'\\''" t <> "'" -- | Tools available to the Coder subagent coderTools :: [Engine.Tool] coderTools = [ Tools.readFileTool, Tools.writeFileTool, Tools.editFileTool, Tools.runBashTool, Tools.searchCodebaseTool, Tools.searchAndReadTool ] -- | Load system prompt from template, falling back to hardcoded if unavailable loadCoderSystemPrompt :: CoderConfig -> Maybe Text -> IO Text loadCoderSystemPrompt cfg maybeInitState = do let ctx = Aeson.object [ "namespace" .= coderNamespace cfg, "task" .= coderTask cfg, "context" .= coderContext cfg, "init_state" .= maybeInitState ] result <- Prompts.renderPrompt "subagents/coder/system" ctx case result of Right prompt -> pure prompt Left _err -> pure (coderSystemPrompt cfg maybeInitState) -- | Hardcoded fallback system prompt for the Coder subagent coderSystemPrompt :: CoderConfig -> Maybe Text -> Text coderSystemPrompt cfg maybeInitState = Text.unlines [ "You are a specialized Coder subagent with a STRICT TOKEN BUDGET.", "", "## Your Task", coderTask cfg, "", "## Namespace", "You are working in namespace: " <> coderNamespace cfg, "Focus your changes on files in this namespace.", "", maybe "" (\ctx -> "## Context\n" <> ctx <> "\n") (coderContext cfg), maybe "" (\st -> "## Current State\n" <> st <> "\n") maybeInitState, "## TOKEN EFFICIENCY (CRITICAL)", "", "You have LIMITED tokens. Every tool output consumes budget. Violating these rules will cause task failure:", "", "1. NEVER read entire files. Use search_and_read or read_file with start_line/end_line", "2. ALWAYS search first (search_codebase or search_and_read) to find exact locations", "3. Use edit_file for changes - NEVER write_file unless creating new files", "4. Limit bash output: pipe through `head -50` or `tail -50` when possible", "5. One focused action per step - don't chain unnecessary operations", "", "## Tool Strategy", "", "To find code: search_and_read (returns matches + context in one call)", "To understand a function: read_file with start_line/end_line (get just that function)", "To edit: edit_file with old_str/new_str (surgical patch, not full file rewrite)", "To explore structure: search_codebase with small max_results", "", "## Guidelines", "1. Make incremental changes - edit one file at a time", "2. After each edit, consider running `lint " <> coderNamespace cfg <> ".hs` to catch issues early", "3. Use `bild " <> coderNamespace cfg <> "` to check your work compiles", "4. Keep changes focused on the task - don't refactor unrelated code", "", "## Completion", "When done: provide a brief summary. The harness runs lint --fix, build, and tests automatically.", "", "## Important", "- Do NOT commit - the harness handles git operations", "- Do NOT read files you don't need to edit" ] -- | Format verify error for agent retry prompt formatVerifyError :: VerifyResult -> Text formatVerifyError VerifySuccess = "" formatVerifyError (VerifyLintFailed err) = "LINT ERRORS:\n" <> err formatVerifyError (VerifyTypecheckFailed err) = "TYPE ERRORS:\n" <> err formatVerifyError (VerifyBuildFailed err) = "BUILD ERRORS:\n" <> err formatVerifyError (VerifyTestFailed err) = "TEST FAILURES:\n" <> err -- | Main entry point: run the Coder subagent with all phases -- -- This implements Anthropic's pattern of looping back to the agent on -- verification failure. The agent gets up to coderMaxVerifyRetries attempts -- to fix verification errors before we give up. -- -- See: https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents -- -- NOTE: We may want to change this behavior in the future. For some errors, -- immediate failure might be better than retry loops. But for now, we follow -- Anthropic's recommendation of letting the agent attempt to fix its mistakes. runCoderSubagent :: Text -> CoderConfig -> IO (Either Text Aeson.Value) runCoderSubagent openRouterKey cfg = do startTime <- Clock.getCurrentTime -- Phase 1: Init initResult <- runCoderInit cfg (ctx, initState) <- case initResult of InitFailed err -> pure (Left err, Nothing) InitBrokenState msg ctxVal -> pure (Right ctxVal, Just ("WARNING: Starting from broken state.\n" <> msg)) InitSuccess ctxVal -> pure (Right ctxVal, Nothing) case ctx of Left err -> pure <| Left ("Init failed: " <> err) Right _context -> do result <- runWorkVerifyLoop openRouterKey cfg initState 0 0 0.0 case result of Left err -> do runCoderRecovery pure <| Left err Right (finalMsg, totalTokens, totalCost, totalIters) -> do -- Phase 4: Commit let summary = Text.take 100 finalMsg commitResult <- runCoderCommit cfg summary case commitResult of Left err -> do runCoderRecovery pure <| Left ("Commit failed: " <> err) Right () -> do endTime <- Clock.getCurrentTime let duration = round (Clock.diffUTCTime endTime startTime) pure <| Right <| Aeson.object [ "status" .= ("success" :: Text), "namespace" .= coderNamespace cfg, "summary" .= finalMsg, "tokens_used" .= totalTokens, "cost_cents" .= totalCost, "duration_seconds" .= (duration :: Int), "iterations" .= totalIters ] -- | Work-verify loop with retries on verification failure -- -- Per Anthropic's guidance, we loop back to the agent when verification fails, -- giving it a chance to fix its mistakes. This is more effective than -- immediately failing, as the agent often can fix simple issues. runWorkVerifyLoop :: Text -> CoderConfig -> Maybe Text -> Int -> Int -> Double -> IO (Either Text (Text, Int, Double, Int)) runWorkVerifyLoop openRouterKey cfg initState verifyAttempt accTokens accCost = do let isRetry = verifyAttempt > 0 let maxRetries = coderMaxVerifyRetries cfg -- Build the prompt - include error context if this is a retry let baseTask = coderTask cfg let taskPrompt = if isRetry then baseTask <> "\n\n## VERIFICATION FAILED - PLEASE FIX\nThis is retry attempt " <> tshow verifyAttempt <> " of " <> tshow maxRetries <> "." else baseTask -- Phase 2: Work (run the agent) let provider = Provider.defaultOpenRouter openRouterKey (coderModel cfg) systemPrompt <- loadCoderSystemPrompt cfg initState let guardrails = Engine.Guardrails { Engine.guardrailMaxCostCents = coderMaxCost cfg - accCost, Engine.guardrailMaxTokens = coderMaxTokens cfg - accTokens, Engine.guardrailMaxDuplicateToolCalls = 20, Engine.guardrailMaxTestFailures = 5, Engine.guardrailMaxEditFailures = 10 } let agentConfig = Engine.AgentConfig { Engine.agentModel = coderModel cfg, Engine.agentTools = coderTools, Engine.agentSystemPrompt = systemPrompt, Engine.agentMaxIterations = coderMaxIterations cfg, Engine.agentGuardrails = guardrails } let engineConfig = Engine.defaultEngineConfig let timeoutMicros = coderTimeout cfg * 1000000 agentResult <- race (threadDelay timeoutMicros) (Engine.runAgentWithProvider engineConfig provider agentConfig taskPrompt) case agentResult of Left () -> pure <| Left "Coder subagent timed out" Right (Left err) -> pure <| Left ("Coder agent failed: " <> err) Right (Right result) -> do let newTokens = accTokens + Engine.resultTotalTokens result let newCost = accCost + Engine.resultTotalCost result let newIters = Engine.resultIterations result -- Phase 3: Verify verifyResult <- runCoderVerify cfg case verifyResult of VerifySuccess -> pure <| Right (Engine.resultFinalMessage result, newTokens, newCost, newIters) failure | verifyAttempt >= maxRetries -> do -- Max retries exceeded, give up let errMsg = "Verification failed after " <> tshow (verifyAttempt + 1) <> " attempts: " <> formatVerifyError failure pure <| Left errMsg | otherwise -> do -- Retry: loop back to agent with error context -- NOTE: This follows Anthropic's pattern. We may want to change -- this for certain error types in the future. let errorContext = formatVerifyError failure let retryState = case initState of Just st -> Just (st <> "\n\n" <> errorContext) Nothing -> Just errorContext runWorkVerifyLoop openRouterKey cfg retryState (verifyAttempt + 1) newTokens newCost