1 files changed, 415 insertions, 0 deletions
diff --git a/Omni/Agent/Subagent/Coder.hs b/Omni/Agent/Subagent/Coder.hs
new file mode 100644
index 0000000..0f5a274
--- /dev/null
+++ b/Omni/Agent/Subagent/Coder.hs
@@ -0,0 +1,415 @@
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- | Hardened Coder Subagent with automatic init/verify/commit phases.
+--
+-- Based on Anthropic's "Effective Harnesses for Long-Running Agents":
+-- https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents
+--
+-- Key features:
+-- - Init phase: Check git status, build namespace to detect broken state
+-- - Work phase: Standard agent loop with tools
+-- - Verify phase: Run lint --fix, build namespace, run tests
+-- - Commit phase: Git commit with namespace-prefixed message
+-- - Recovery phase: Revert changes on failure
+--
+-- : out omni-agent-subagent-coder
+-- : dep aeson
+-- : dep async
+-- : dep stm
+-- : dep time
+module Omni.Agent.Subagent.Coder
+  ( -- * Types
+    CoderConfig (..),
+    CoderContext (..),
+    InitResult (..),
+    VerifyResult (..),
+
+    -- * Main Entry Point
+    runCoderSubagent,
+
+    -- * Phases (exported for testing)
+    runCoderInit,
+    runCoderVerify,
+    runCoderCommit,
+    runCoderRecovery,
+
+    -- * Helpers
+    defaultCoderConfig,
+    coderSystemPrompt,
+    coderTools,
+
+    -- * Testing
+    main,
+    test,
+  )
+where
+
+import Alpha
+import Data.Aeson ((.=))
+import qualified Data.Aeson as Aeson
+import qualified Data.Text as Text
+import qualified Data.Time.Clock as Clock
+import qualified Omni.Agent.Engine as Engine
+import qualified Omni.Agent.Provider as Provider
+import qualified Omni.Agent.Tools as Tools
+import qualified Omni.Test as Test
+import qualified System.Exit as Exit
+import qualified System.Process as Process
+
+main :: IO ()
+main = Test.run test
+
+test :: Test.Tree
+test =
+  Test.group
+    "Omni.Agent.Subagent.Coder"
+    [ Test.unit "CoderConfig has sensible defaults" <| do
+        let cfg = defaultCoderConfig "Omni/Task" "Implement feature X"
+        coderNamespace cfg Test.@=? "Omni/Task"
+        coderTimeout cfg Test.@=? 1200,
+      Test.unit "coderSystemPrompt includes namespace" <| do
+        let cfg = defaultCoderConfig "Biz/Cloud" "Fix the bug"
+        let prompt = coderSystemPrompt cfg Nothing
+        Text.isInfixOf "Biz/Cloud" prompt Test.@=? True,
+      Test.unit "runBashCapture returns output" <| do
+        (code, out, _) <- runBashCapture "echo hello"
+        code Test.@=? Exit.ExitSuccess
+        Text.strip out Test.@=? "hello"
+    ]
+
+-- | Configuration for a Coder subagent run
+data CoderConfig = CoderConfig
+  { coderNamespace :: Text,
+    coderTask :: Text,
+    coderContext :: Maybe Text,
+    coderModel :: Text,
+    coderTimeout :: Int,
+    coderMaxCost :: Double,
+    coderMaxTokens :: Int,
+    coderMaxIterations :: Int,
+    coderMaxVerifyRetries :: Int
+  }
+  deriving (Show, Eq, Generic)
+
+instance Aeson.ToJSON CoderConfig
+
+instance Aeson.FromJSON CoderConfig
+
+-- | Runtime context gathered during init phase
+data CoderContext = CoderContext
+  { ctxGitBranch :: Text,
+    ctxInitiallyBroken :: Bool,
+    ctxChangedFiles :: [Text]
+  }
+  deriving (Show, Eq, Generic)
+
+-- | Result of the init phase
+data InitResult
+  = InitSuccess CoderContext
+  | InitBrokenState Text CoderContext
+  | InitFailed Text
+  deriving (Show, Eq)
+
+-- | Result of the verify phase
+data VerifyResult
+  = VerifySuccess
+  | VerifyLintFailed Text
+  | VerifyTypecheckFailed Text
+  | VerifyBuildFailed Text
+  | VerifyTestFailed Text
+  deriving (Show, Eq)
+
+defaultCoderConfig :: Text -> Text -> CoderConfig
+defaultCoderConfig namespace task =
+  CoderConfig
+    { coderNamespace = namespace,
+      coderTask = task,
+      coderContext = Nothing,
+      coderModel = "anthropic/claude-sonnet-4",
+      coderTimeout = 1200,
+      coderMaxCost = 200.0,
+      coderMaxTokens = 300000,
+      coderMaxIterations = 30,
+      coderMaxVerifyRetries = 3
+    }
+
+-- | Run a bash command and capture output
+runBashCapture :: Text -> IO (Exit.ExitCode, Text, Text)
+runBashCapture cmd = do
+  (code, out, err) <- Process.readProcessWithExitCode "bash" ["-c", Text.unpack cmd] ""
+  pure (code, Text.pack out, Text.pack err)
+
+-- | Phase 1: Initialize - check environment, detect broken state
+runCoderInit :: CoderConfig -> IO InitResult
+runCoderInit cfg = do
+  -- Check git status
+  (_, branchOut, _) <- runBashCapture "git branch --show-current"
+  let gitBranch = Text.strip branchOut
+
+  -- Try to build the namespace to detect broken state
+  (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1 || true")
+
+  let isBroken = buildCode /= Exit.ExitSuccess
+  let ctx =
+        CoderContext
+          { ctxGitBranch = gitBranch,
+            ctxInitiallyBroken = isBroken,
+            ctxChangedFiles = []
+          }
+
+  if isBroken
+    then pure <| InitBrokenState ("Build failed: " <> Text.take 500 (buildOut <> buildErr)) ctx
+    else pure <| InitSuccess ctx
+
+-- | Phase 3: Verify - run lint --fix, typecheck, build, test
+--
+-- Verification order:
+-- 1. lint --fix: Auto-apply formatting and simple fixes
+-- 2. typecheck.sh: Run type checker (works for Python and other languages)
+-- 3. bild: Build/compile the namespace
+-- 4. bild --test: Run tests
+runCoderVerify :: CoderConfig -> IO VerifyResult
+runCoderVerify cfg = do
+  -- First, run lint --fix to auto-apply formatting
+  _ <- runBashCapture ("lint --fix " <> coderNamespace cfg <> ".hs 2>&1")
+
+  -- Run typecheck.sh (handles Python mypy, etc.)
+  (typecheckCode, typecheckOut, typecheckErr) <-
+    runBashCapture ("typecheck.sh " <> coderNamespace cfg <> " 2>&1 || true")
+  -- Note: typecheck.sh may not exist for all namespaces, so we use || true
+  -- and only fail if it explicitly returns non-zero with actual errors
+  let typecheckFailed =
+        typecheckCode
+          /= Exit.ExitSuccess
+          && not (Text.null (Text.strip typecheckOut))
+          && not ("No such file" `Text.isInfixOf` typecheckErr)
+
+  if typecheckFailed
+    then pure <| VerifyTypecheckFailed (Text.take 1000 (typecheckOut <> typecheckErr))
+    else do
+      -- Build the namespace
+      (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1")
+      case buildCode of
+        Exit.ExitSuccess -> do
+          -- Run tests if they exist
+          (testCode, testOut, testErr) <- runBashCapture ("bild --test " <> coderNamespace cfg <> " 2>&1")
+          case testCode of
+            Exit.ExitSuccess -> pure VerifySuccess
+            _ -> pure <| VerifyTestFailed (Text.take 1000 (testOut <> testErr))
+        _ -> pure <| VerifyBuildFailed (Text.take 1000 (buildOut <> buildErr))
+
+-- | Phase 4: Commit - git commit with message
+runCoderCommit :: CoderConfig -> Text -> IO (Either Text ())
+runCoderCommit cfg summary = do
+  let commitMsg = coderNamespace cfg <> ": " <> Text.take 72 summary
+
+  -- Stage all changes
+  (addCode, _, addErr) <- runBashCapture "git add -A"
+  case addCode of
+    Exit.ExitSuccess -> do
+      -- Commit
+      (commitCode, _, commitErr) <- runBashCapture ("git commit -m " <> quoteShell commitMsg)
+      case commitCode of
+        Exit.ExitSuccess -> pure (Right ())
+        _ -> pure <| Left ("git commit failed: " <> commitErr)
+    _ -> pure <| Left ("git add failed: " <> addErr)
+
+-- | Phase 5: Recovery - revert changes on failure
+runCoderRecovery :: IO ()
+runCoderRecovery = do
+  -- Revert uncommitted changes
+  _ <- runBashCapture "git checkout -- ."
+  _ <- runBashCapture "git clean -fd"
+  pure ()
+
+-- | Quote a string for shell
+quoteShell :: Text -> Text
+quoteShell t = "'" <> Text.replace "'" "'\\''" t <> "'"
+
+-- | Tools available to the Coder subagent
+coderTools :: [Engine.Tool]
+coderTools =
+  [ Tools.readFileTool,
+    Tools.writeFileTool,
+    Tools.editFileTool,
+    Tools.runBashTool,
+    Tools.searchCodebaseTool,
+    Tools.searchAndReadTool
+  ]
+
+-- | System prompt for the Coder subagent
+coderSystemPrompt :: CoderConfig -> Maybe Text -> Text
+coderSystemPrompt cfg maybeInitState =
+  Text.unlines
+    [ "You are a specialized Coder subagent.",
+      "",
+      "## Your Task",
+      coderTask cfg,
+      "",
+      "## Namespace",
+      "You are working in namespace: " <> coderNamespace cfg,
+      "Focus your changes on files in this namespace.",
+      "",
+      maybe "" (\ctx -> "## Context\n" <> ctx <> "\n") (coderContext cfg),
+      maybe "" (\st -> "## Current State\n" <> st <> "\n") maybeInitState,
+      "## Guidelines",
+      "1. Make incremental changes - edit one file at a time",
+      "2. After each edit, consider running `lint " <> coderNamespace cfg <> ".hs` to catch issues early",
+      "3. Use `bild " <> coderNamespace cfg <> "` to check your work compiles",
+      "4. If tests exist, run `bild --test " <> coderNamespace cfg <> "` to verify",
+      "5. Keep changes focused on the task - don't refactor unrelated code",
+      "6. If you find bugs unrelated to your task, note them but don't fix them",
+      "",
+      "## Completion",
+      "When you believe you're done:",
+      "1. Ensure all edits are complete",
+      "2. The harness will automatically run lint --fix, build, and tests",
+      "3. If verification fails, you'll be asked to fix the issues",
+      "4. Provide a brief summary of what you changed",
+      "",
+      "## Important",
+      "- Do NOT commit - the harness handles git operations",
+      "- Focus on making the code changes correctly"
+    ]
+
+-- | Format verify error for agent retry prompt
+formatVerifyError :: VerifyResult -> Text
+formatVerifyError VerifySuccess = ""
+formatVerifyError (VerifyLintFailed err) = "LINT ERRORS:\n" <> err
+formatVerifyError (VerifyTypecheckFailed err) = "TYPE ERRORS:\n" <> err
+formatVerifyError (VerifyBuildFailed err) = "BUILD ERRORS:\n" <> err
+formatVerifyError (VerifyTestFailed err) = "TEST FAILURES:\n" <> err
+
+-- | Main entry point: run the Coder subagent with all phases
+--
+-- This implements Anthropic's pattern of looping back to the agent on
+-- verification failure. The agent gets up to coderMaxVerifyRetries attempts
+-- to fix verification errors before we give up.
+--
+-- See: https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents
+--
+-- NOTE: We may want to change this behavior in the future. For some errors,
+-- immediate failure might be better than retry loops. But for now, we follow
+-- Anthropic's recommendation of letting the agent attempt to fix its mistakes.
+runCoderSubagent :: Text -> CoderConfig -> IO (Either Text Aeson.Value)
+runCoderSubagent openRouterKey cfg = do
+  startTime <- Clock.getCurrentTime
+
+  -- Phase 1: Init
+  initResult <- runCoderInit cfg
+  (ctx, initState) <- case initResult of
+    InitFailed err -> pure (Left err, Nothing)
+    InitBrokenState msg ctxVal -> pure (Right ctxVal, Just ("WARNING: Starting from broken state.\n" <> msg))
+    InitSuccess ctxVal -> pure (Right ctxVal, Nothing)
+
+  case ctx of
+    Left err -> pure <| Left ("Init failed: " <> err)
+    Right _context -> do
+      result <- runWorkVerifyLoop openRouterKey cfg initState 0 0 0.0
+      case result of
+        Left err -> do
+          runCoderRecovery
+          pure <| Left err
+        Right (finalMsg, totalTokens, totalCost, totalIters) -> do
+          -- Phase 4: Commit
+          let summary = Text.take 100 finalMsg
+          commitResult <- runCoderCommit cfg summary
+          case commitResult of
+            Left err -> do
+              runCoderRecovery
+              pure <| Left ("Commit failed: " <> err)
+            Right () -> do
+              endTime <- Clock.getCurrentTime
+              let duration = round (Clock.diffUTCTime endTime startTime)
+              pure
+                <| Right
+                <| Aeson.object
+                  [ "status" .= ("success" :: Text),
+                    "namespace" .= coderNamespace cfg,
+                    "summary" .= finalMsg,
+                    "tokens_used" .= totalTokens,
+                    "cost_cents" .= totalCost,
+                    "duration_seconds" .= (duration :: Int),
+                    "iterations" .= totalIters
+                  ]
+
+-- | Work-verify loop with retries on verification failure
+--
+-- Per Anthropic's guidance, we loop back to the agent when verification fails,
+-- giving it a chance to fix its mistakes. This is more effective than
+-- immediately failing, as the agent often can fix simple issues.
+runWorkVerifyLoop ::
+  Text ->
+  CoderConfig ->
+  Maybe Text ->
+  Int ->
+  Int ->
+  Double ->
+  IO (Either Text (Text, Int, Double, Int))
+runWorkVerifyLoop openRouterKey cfg initState verifyAttempt accTokens accCost = do
+  let isRetry = verifyAttempt > 0
+  let maxRetries = coderMaxVerifyRetries cfg
+
+  -- Build the prompt - include error context if this is a retry
+  let baseTask = coderTask cfg
+  let taskPrompt =
+        if isRetry
+          then baseTask <> "\n\n## VERIFICATION FAILED - PLEASE FIX\nThis is retry attempt " <> tshow verifyAttempt <> " of " <> tshow maxRetries <> "."
+          else baseTask
+
+  -- Phase 2: Work (run the agent)
+  let provider = Provider.defaultOpenRouter openRouterKey (coderModel cfg)
+  let systemPrompt = coderSystemPrompt cfg initState
+  let guardrails =
+        Engine.Guardrails
+          { Engine.guardrailMaxCostCents = coderMaxCost cfg - accCost,
+            Engine.guardrailMaxTokens = coderMaxTokens cfg - accTokens,
+            Engine.guardrailMaxDuplicateToolCalls = 20,
+            Engine.guardrailMaxTestFailures = 5,
+            Engine.guardrailMaxEditFailures = 10
+          }
+  let agentConfig =
+        Engine.AgentConfig
+          { Engine.agentModel = coderModel cfg,
+            Engine.agentTools = coderTools,
+            Engine.agentSystemPrompt = systemPrompt,
+            Engine.agentMaxIterations = coderMaxIterations cfg,
+            Engine.agentGuardrails = guardrails
+          }
+  let engineConfig = Engine.defaultEngineConfig
+
+  let timeoutMicros = coderTimeout cfg * 1000000
+  agentResult <-
+    race
+      (threadDelay timeoutMicros)
+      (Engine.runAgentWithProvider engineConfig provider agentConfig taskPrompt)
+
+  case agentResult of
+    Left () -> pure <| Left "Coder subagent timed out"
+    Right (Left err) -> pure <| Left ("Coder agent failed: " <> err)
+    Right (Right result) -> do
+      let newTokens = accTokens + Engine.resultTotalTokens result
+      let newCost = accCost + Engine.resultTotalCost result
+      let newIters = Engine.resultIterations result
+
+      -- Phase 3: Verify
+      verifyResult <- runCoderVerify cfg
+      case verifyResult of
+        VerifySuccess ->
+          pure <| Right (Engine.resultFinalMessage result, newTokens, newCost, newIters)
+        failure
+          | verifyAttempt >= maxRetries -> do
+              -- Max retries exceeded, give up
+              let errMsg = "Verification failed after " <> tshow (verifyAttempt + 1) <> " attempts: " <> formatVerifyError failure
+              pure <| Left errMsg
+          | otherwise -> do
+              -- Retry: loop back to agent with error context
+              -- NOTE: This follows Anthropic's pattern. We may want to change
+              -- this for certain error types in the future.
+              let errorContext = formatVerifyError failure
+              let retryState = case initState of
+                    Just st -> Just (st <> "\n\n" <> errorContext)
+                    Nothing -> Just errorContext
+              runWorkVerifyLoop openRouterKey cfg retryState (verifyAttempt + 1) newTokens newCost