diff options
| author | Ben Sima <ben@bensima.com> | 2025-12-18 13:39:40 -0500 |
|---|---|---|
| committer | Ben Sima <ben@bensima.com> | 2025-12-18 13:39:40 -0500 |
| commit | bd7724068938daa44dc74c28ab0aa5c45477bbfd (patch) | |
| tree | 919201025d3f7c95656b66947b48ecc983db5e6c /Omni/Agent/Subagent/Coder.hs | |
| parent | 133df9a099785b5eabb5ad19bcd7daa33eff9afe (diff) | |
Omni/Agent/Subagent/Coder: decouple from jr task system
Remove task_id requirement and all jr task CLI calls. The Coder subagent
now only requires namespace and task description - no external task
tracking needed.
Changes:
- Remove coderTaskId from CoderConfig
- Remove jr task show/update/comment calls
- Commit message uses namespace prefix instead of task ID
- Recovery phase just reverts git, no task comment
- Subagent.hs only validates namespace for Coder role
Diffstat (limited to 'Omni/Agent/Subagent/Coder.hs')
| -rw-r--r-- | Omni/Agent/Subagent/Coder.hs | 415 |
1 files changed, 415 insertions, 0 deletions
diff --git a/Omni/Agent/Subagent/Coder.hs b/Omni/Agent/Subagent/Coder.hs new file mode 100644 index 0000000..0f5a274 --- /dev/null +++ b/Omni/Agent/Subagent/Coder.hs @@ -0,0 +1,415 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE NoImplicitPrelude #-} + +-- | Hardened Coder Subagent with automatic init/verify/commit phases. +-- +-- Based on Anthropic's "Effective Harnesses for Long-Running Agents": +-- https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents +-- +-- Key features: +-- - Init phase: Check git status, build namespace to detect broken state +-- - Work phase: Standard agent loop with tools +-- - Verify phase: Run lint --fix, build namespace, run tests +-- - Commit phase: Git commit with namespace-prefixed message +-- - Recovery phase: Revert changes on failure +-- +-- : out omni-agent-subagent-coder +-- : dep aeson +-- : dep async +-- : dep stm +-- : dep time +module Omni.Agent.Subagent.Coder + ( -- * Types + CoderConfig (..), + CoderContext (..), + InitResult (..), + VerifyResult (..), + + -- * Main Entry Point + runCoderSubagent, + + -- * Phases (exported for testing) + runCoderInit, + runCoderVerify, + runCoderCommit, + runCoderRecovery, + + -- * Helpers + defaultCoderConfig, + coderSystemPrompt, + coderTools, + + -- * Testing + main, + test, + ) +where + +import Alpha +import Data.Aeson ((.=)) +import qualified Data.Aeson as Aeson +import qualified Data.Text as Text +import qualified Data.Time.Clock as Clock +import qualified Omni.Agent.Engine as Engine +import qualified Omni.Agent.Provider as Provider +import qualified Omni.Agent.Tools as Tools +import qualified Omni.Test as Test +import qualified System.Exit as Exit +import qualified System.Process as Process + +main :: IO () +main = Test.run test + +test :: Test.Tree +test = + Test.group + "Omni.Agent.Subagent.Coder" + [ Test.unit "CoderConfig has sensible defaults" <| do + let cfg = defaultCoderConfig "Omni/Task" "Implement feature X" + coderNamespace cfg Test.@=? "Omni/Task" + coderTimeout cfg Test.@=? 1200, + Test.unit "coderSystemPrompt includes namespace" <| do + let cfg = defaultCoderConfig "Biz/Cloud" "Fix the bug" + let prompt = coderSystemPrompt cfg Nothing + Text.isInfixOf "Biz/Cloud" prompt Test.@=? True, + Test.unit "runBashCapture returns output" <| do + (code, out, _) <- runBashCapture "echo hello" + code Test.@=? Exit.ExitSuccess + Text.strip out Test.@=? "hello" + ] + +-- | Configuration for a Coder subagent run +data CoderConfig = CoderConfig + { coderNamespace :: Text, + coderTask :: Text, + coderContext :: Maybe Text, + coderModel :: Text, + coderTimeout :: Int, + coderMaxCost :: Double, + coderMaxTokens :: Int, + coderMaxIterations :: Int, + coderMaxVerifyRetries :: Int + } + deriving (Show, Eq, Generic) + +instance Aeson.ToJSON CoderConfig + +instance Aeson.FromJSON CoderConfig + +-- | Runtime context gathered during init phase +data CoderContext = CoderContext + { ctxGitBranch :: Text, + ctxInitiallyBroken :: Bool, + ctxChangedFiles :: [Text] + } + deriving (Show, Eq, Generic) + +-- | Result of the init phase +data InitResult + = InitSuccess CoderContext + | InitBrokenState Text CoderContext + | InitFailed Text + deriving (Show, Eq) + +-- | Result of the verify phase +data VerifyResult + = VerifySuccess + | VerifyLintFailed Text + | VerifyTypecheckFailed Text + | VerifyBuildFailed Text + | VerifyTestFailed Text + deriving (Show, Eq) + +defaultCoderConfig :: Text -> Text -> CoderConfig +defaultCoderConfig namespace task = + CoderConfig + { coderNamespace = namespace, + coderTask = task, + coderContext = Nothing, + coderModel = "anthropic/claude-sonnet-4", + coderTimeout = 1200, + coderMaxCost = 200.0, + coderMaxTokens = 300000, + coderMaxIterations = 30, + coderMaxVerifyRetries = 3 + } + +-- | Run a bash command and capture output +runBashCapture :: Text -> IO (Exit.ExitCode, Text, Text) +runBashCapture cmd = do + (code, out, err) <- Process.readProcessWithExitCode "bash" ["-c", Text.unpack cmd] "" + pure (code, Text.pack out, Text.pack err) + +-- | Phase 1: Initialize - check environment, detect broken state +runCoderInit :: CoderConfig -> IO InitResult +runCoderInit cfg = do + -- Check git status + (_, branchOut, _) <- runBashCapture "git branch --show-current" + let gitBranch = Text.strip branchOut + + -- Try to build the namespace to detect broken state + (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1 || true") + + let isBroken = buildCode /= Exit.ExitSuccess + let ctx = + CoderContext + { ctxGitBranch = gitBranch, + ctxInitiallyBroken = isBroken, + ctxChangedFiles = [] + } + + if isBroken + then pure <| InitBrokenState ("Build failed: " <> Text.take 500 (buildOut <> buildErr)) ctx + else pure <| InitSuccess ctx + +-- | Phase 3: Verify - run lint --fix, typecheck, build, test +-- +-- Verification order: +-- 1. lint --fix: Auto-apply formatting and simple fixes +-- 2. typecheck.sh: Run type checker (works for Python and other languages) +-- 3. bild: Build/compile the namespace +-- 4. bild --test: Run tests +runCoderVerify :: CoderConfig -> IO VerifyResult +runCoderVerify cfg = do + -- First, run lint --fix to auto-apply formatting + _ <- runBashCapture ("lint --fix " <> coderNamespace cfg <> ".hs 2>&1") + + -- Run typecheck.sh (handles Python mypy, etc.) + (typecheckCode, typecheckOut, typecheckErr) <- + runBashCapture ("typecheck.sh " <> coderNamespace cfg <> " 2>&1 || true") + -- Note: typecheck.sh may not exist for all namespaces, so we use || true + -- and only fail if it explicitly returns non-zero with actual errors + let typecheckFailed = + typecheckCode + /= Exit.ExitSuccess + && not (Text.null (Text.strip typecheckOut)) + && not ("No such file" `Text.isInfixOf` typecheckErr) + + if typecheckFailed + then pure <| VerifyTypecheckFailed (Text.take 1000 (typecheckOut <> typecheckErr)) + else do + -- Build the namespace + (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1") + case buildCode of + Exit.ExitSuccess -> do + -- Run tests if they exist + (testCode, testOut, testErr) <- runBashCapture ("bild --test " <> coderNamespace cfg <> " 2>&1") + case testCode of + Exit.ExitSuccess -> pure VerifySuccess + _ -> pure <| VerifyTestFailed (Text.take 1000 (testOut <> testErr)) + _ -> pure <| VerifyBuildFailed (Text.take 1000 (buildOut <> buildErr)) + +-- | Phase 4: Commit - git commit with message +runCoderCommit :: CoderConfig -> Text -> IO (Either Text ()) +runCoderCommit cfg summary = do + let commitMsg = coderNamespace cfg <> ": " <> Text.take 72 summary + + -- Stage all changes + (addCode, _, addErr) <- runBashCapture "git add -A" + case addCode of + Exit.ExitSuccess -> do + -- Commit + (commitCode, _, commitErr) <- runBashCapture ("git commit -m " <> quoteShell commitMsg) + case commitCode of + Exit.ExitSuccess -> pure (Right ()) + _ -> pure <| Left ("git commit failed: " <> commitErr) + _ -> pure <| Left ("git add failed: " <> addErr) + +-- | Phase 5: Recovery - revert changes on failure +runCoderRecovery :: IO () +runCoderRecovery = do + -- Revert uncommitted changes + _ <- runBashCapture "git checkout -- ." + _ <- runBashCapture "git clean -fd" + pure () + +-- | Quote a string for shell +quoteShell :: Text -> Text +quoteShell t = "'" <> Text.replace "'" "'\\''" t <> "'" + +-- | Tools available to the Coder subagent +coderTools :: [Engine.Tool] +coderTools = + [ Tools.readFileTool, + Tools.writeFileTool, + Tools.editFileTool, + Tools.runBashTool, + Tools.searchCodebaseTool, + Tools.searchAndReadTool + ] + +-- | System prompt for the Coder subagent +coderSystemPrompt :: CoderConfig -> Maybe Text -> Text +coderSystemPrompt cfg maybeInitState = + Text.unlines + [ "You are a specialized Coder subagent.", + "", + "## Your Task", + coderTask cfg, + "", + "## Namespace", + "You are working in namespace: " <> coderNamespace cfg, + "Focus your changes on files in this namespace.", + "", + maybe "" (\ctx -> "## Context\n" <> ctx <> "\n") (coderContext cfg), + maybe "" (\st -> "## Current State\n" <> st <> "\n") maybeInitState, + "## Guidelines", + "1. Make incremental changes - edit one file at a time", + "2. After each edit, consider running `lint " <> coderNamespace cfg <> ".hs` to catch issues early", + "3. Use `bild " <> coderNamespace cfg <> "` to check your work compiles", + "4. If tests exist, run `bild --test " <> coderNamespace cfg <> "` to verify", + "5. Keep changes focused on the task - don't refactor unrelated code", + "6. If you find bugs unrelated to your task, note them but don't fix them", + "", + "## Completion", + "When you believe you're done:", + "1. Ensure all edits are complete", + "2. The harness will automatically run lint --fix, build, and tests", + "3. If verification fails, you'll be asked to fix the issues", + "4. Provide a brief summary of what you changed", + "", + "## Important", + "- Do NOT commit - the harness handles git operations", + "- Focus on making the code changes correctly" + ] + +-- | Format verify error for agent retry prompt +formatVerifyError :: VerifyResult -> Text +formatVerifyError VerifySuccess = "" +formatVerifyError (VerifyLintFailed err) = "LINT ERRORS:\n" <> err +formatVerifyError (VerifyTypecheckFailed err) = "TYPE ERRORS:\n" <> err +formatVerifyError (VerifyBuildFailed err) = "BUILD ERRORS:\n" <> err +formatVerifyError (VerifyTestFailed err) = "TEST FAILURES:\n" <> err + +-- | Main entry point: run the Coder subagent with all phases +-- +-- This implements Anthropic's pattern of looping back to the agent on +-- verification failure. The agent gets up to coderMaxVerifyRetries attempts +-- to fix verification errors before we give up. +-- +-- See: https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents +-- +-- NOTE: We may want to change this behavior in the future. For some errors, +-- immediate failure might be better than retry loops. But for now, we follow +-- Anthropic's recommendation of letting the agent attempt to fix its mistakes. +runCoderSubagent :: Text -> CoderConfig -> IO (Either Text Aeson.Value) +runCoderSubagent openRouterKey cfg = do + startTime <- Clock.getCurrentTime + + -- Phase 1: Init + initResult <- runCoderInit cfg + (ctx, initState) <- case initResult of + InitFailed err -> pure (Left err, Nothing) + InitBrokenState msg ctxVal -> pure (Right ctxVal, Just ("WARNING: Starting from broken state.\n" <> msg)) + InitSuccess ctxVal -> pure (Right ctxVal, Nothing) + + case ctx of + Left err -> pure <| Left ("Init failed: " <> err) + Right _context -> do + result <- runWorkVerifyLoop openRouterKey cfg initState 0 0 0.0 + case result of + Left err -> do + runCoderRecovery + pure <| Left err + Right (finalMsg, totalTokens, totalCost, totalIters) -> do + -- Phase 4: Commit + let summary = Text.take 100 finalMsg + commitResult <- runCoderCommit cfg summary + case commitResult of + Left err -> do + runCoderRecovery + pure <| Left ("Commit failed: " <> err) + Right () -> do + endTime <- Clock.getCurrentTime + let duration = round (Clock.diffUTCTime endTime startTime) + pure + <| Right + <| Aeson.object + [ "status" .= ("success" :: Text), + "namespace" .= coderNamespace cfg, + "summary" .= finalMsg, + "tokens_used" .= totalTokens, + "cost_cents" .= totalCost, + "duration_seconds" .= (duration :: Int), + "iterations" .= totalIters + ] + +-- | Work-verify loop with retries on verification failure +-- +-- Per Anthropic's guidance, we loop back to the agent when verification fails, +-- giving it a chance to fix its mistakes. This is more effective than +-- immediately failing, as the agent often can fix simple issues. +runWorkVerifyLoop :: + Text -> + CoderConfig -> + Maybe Text -> + Int -> + Int -> + Double -> + IO (Either Text (Text, Int, Double, Int)) +runWorkVerifyLoop openRouterKey cfg initState verifyAttempt accTokens accCost = do + let isRetry = verifyAttempt > 0 + let maxRetries = coderMaxVerifyRetries cfg + + -- Build the prompt - include error context if this is a retry + let baseTask = coderTask cfg + let taskPrompt = + if isRetry + then baseTask <> "\n\n## VERIFICATION FAILED - PLEASE FIX\nThis is retry attempt " <> tshow verifyAttempt <> " of " <> tshow maxRetries <> "." + else baseTask + + -- Phase 2: Work (run the agent) + let provider = Provider.defaultOpenRouter openRouterKey (coderModel cfg) + let systemPrompt = coderSystemPrompt cfg initState + let guardrails = + Engine.Guardrails + { Engine.guardrailMaxCostCents = coderMaxCost cfg - accCost, + Engine.guardrailMaxTokens = coderMaxTokens cfg - accTokens, + Engine.guardrailMaxDuplicateToolCalls = 20, + Engine.guardrailMaxTestFailures = 5, + Engine.guardrailMaxEditFailures = 10 + } + let agentConfig = + Engine.AgentConfig + { Engine.agentModel = coderModel cfg, + Engine.agentTools = coderTools, + Engine.agentSystemPrompt = systemPrompt, + Engine.agentMaxIterations = coderMaxIterations cfg, + Engine.agentGuardrails = guardrails + } + let engineConfig = Engine.defaultEngineConfig + + let timeoutMicros = coderTimeout cfg * 1000000 + agentResult <- + race + (threadDelay timeoutMicros) + (Engine.runAgentWithProvider engineConfig provider agentConfig taskPrompt) + + case agentResult of + Left () -> pure <| Left "Coder subagent timed out" + Right (Left err) -> pure <| Left ("Coder agent failed: " <> err) + Right (Right result) -> do + let newTokens = accTokens + Engine.resultTotalTokens result + let newCost = accCost + Engine.resultTotalCost result + let newIters = Engine.resultIterations result + + -- Phase 3: Verify + verifyResult <- runCoderVerify cfg + case verifyResult of + VerifySuccess -> + pure <| Right (Engine.resultFinalMessage result, newTokens, newCost, newIters) + failure + | verifyAttempt >= maxRetries -> do + -- Max retries exceeded, give up + let errMsg = "Verification failed after " <> tshow (verifyAttempt + 1) <> " attempts: " <> formatVerifyError failure + pure <| Left errMsg + | otherwise -> do + -- Retry: loop back to agent with error context + -- NOTE: This follows Anthropic's pattern. We may want to change + -- this for certain error types in the future. + let errorContext = formatVerifyError failure + let retryState = case initState of + Just st -> Just (st <> "\n\n" <> errorContext) + Nothing -> Just errorContext + runWorkVerifyLoop openRouterKey cfg retryState (verifyAttempt + 1) newTokens newCost |
