Omni/Agent/Subagent/Coder.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436

{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}

-- | Hardened Coder Subagent with automatic init/verify/commit phases.
--
-- Based on Anthropic's "Effective Harnesses for Long-Running Agents":
-- https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents
--
-- Key features:
-- - Init phase: Check git status, build namespace to detect broken state
-- - Work phase: Standard agent loop with tools
-- - Verify phase: Run lint --fix, build namespace, run tests
-- - Commit phase: Git commit with namespace-prefixed message
-- - Recovery phase: Revert changes on failure
--
-- : out omni-agent-subagent-coder
-- : dep aeson
-- : dep async
-- : dep directory
-- : dep mustache
-- : dep stm
-- : dep time
module Omni.Agent.Subagent.Coder
  ( -- * Types
    CoderConfig (..),
    CoderContext (..),
    InitResult (..),
    VerifyResult (..),

    -- * Main Entry Point
    runCoderSubagent,

    -- * Phases (exported for testing)
    runCoderInit,
    runCoderVerify,
    runCoderCommit,
    runCoderRecovery,

    -- * Helpers
    defaultCoderConfig,
    coderSystemPrompt,
    coderTools,

    -- * Testing
    main,
    test,
  )
where

import Alpha
import Data.Aeson ((.=))
import qualified Data.Aeson as Aeson
import qualified Data.Text as Text
import qualified Data.Time.Clock as Clock
import qualified Omni.Agent.Engine as Engine
import qualified Omni.Agent.Prompts as Prompts
import qualified Omni.Agent.Provider as Provider
import qualified Omni.Agent.Tools as Tools
import qualified Omni.Test as Test
import qualified System.Exit as Exit
import qualified System.Process as Process

main :: IO ()
main = Test.run test

test :: Test.Tree
test =
  Test.group
    "Omni.Agent.Subagent.Coder"
    [ Test.unit "CoderConfig has sensible defaults" <| do
        let cfg = defaultCoderConfig "Omni/Task" "Implement feature X"
        coderNamespace cfg Test.@=? "Omni/Task"
        coderTimeout cfg Test.@=? 1200,
      Test.unit "coderSystemPrompt includes namespace" <| do
        let cfg = defaultCoderConfig "Biz/Cloud" "Fix the bug"
        let prompt = coderSystemPrompt cfg Nothing
        Text.isInfixOf "Biz/Cloud" prompt Test.@=? True,
      Test.unit "runBashCapture returns output" <| do
        (code, out, _) <- runBashCapture "echo hello"
        code Test.@=? Exit.ExitSuccess
        Text.strip out Test.@=? "hello"
    ]

-- | Configuration for a Coder subagent run
data CoderConfig = CoderConfig
  { coderNamespace :: Text,
    coderTask :: Text,
    coderContext :: Maybe Text,
    coderModel :: Text,
    coderTimeout :: Int,
    coderMaxCost :: Double,
    coderMaxTokens :: Int,
    coderMaxIterations :: Int,
    coderMaxVerifyRetries :: Int
  }
  deriving (Show, Eq, Generic)

instance Aeson.ToJSON CoderConfig

instance Aeson.FromJSON CoderConfig

-- | Runtime context gathered during init phase
data CoderContext = CoderContext
  { ctxGitBranch :: Text,
    ctxInitiallyBroken :: Bool,
    ctxChangedFiles :: [Text]
  }
  deriving (Show, Eq, Generic)

-- | Result of the init phase
data InitResult
  = InitSuccess CoderContext
  | InitBrokenState Text CoderContext
  | InitFailed Text
  deriving (Show, Eq)

-- | Result of the verify phase
data VerifyResult
  = VerifySuccess
  | VerifyLintFailed Text
  | VerifyTypecheckFailed Text
  | VerifyBuildFailed Text
  | VerifyTestFailed Text
  deriving (Show, Eq)

defaultCoderConfig :: Text -> Text -> CoderConfig
defaultCoderConfig namespace task =
  CoderConfig
    { coderNamespace = namespace,
      coderTask = task,
      coderContext = Nothing,
      coderModel = "anthropic/claude-sonnet-4",
      coderTimeout = 1200,
      coderMaxCost = 200.0,
      coderMaxTokens = 300000,
      coderMaxIterations = 30,
      coderMaxVerifyRetries = 3
    }

-- | Run a bash command and capture output
runBashCapture :: Text -> IO (Exit.ExitCode, Text, Text)
runBashCapture cmd = do
  (code, out, err) <- Process.readProcessWithExitCode "bash" ["-c", Text.unpack cmd] ""
  pure (code, Text.pack out, Text.pack err)

-- | Phase 1: Initialize - check environment, detect broken state
runCoderInit :: CoderConfig -> IO InitResult
runCoderInit cfg = do
  -- Pull latest changes
  _ <- runBashCapture "git pull --ff-only 2>&1 || true"

  -- Check git status
  (_, branchOut, _) <- runBashCapture "git branch --show-current"
  let gitBranch = Text.strip branchOut

  -- Try to build the namespace to detect broken state
  (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1 || true")

  let isBroken = buildCode /= Exit.ExitSuccess
  let ctx =
        CoderContext
          { ctxGitBranch = gitBranch,
            ctxInitiallyBroken = isBroken,
            ctxChangedFiles = []
          }

  if isBroken
    then pure <| InitBrokenState ("Build failed: " <> Text.take 500 (buildOut <> buildErr)) ctx
    else pure <| InitSuccess ctx

-- | Phase 3: Verify - run lint --fix, typecheck, build, test
--
-- Verification order:
-- 1. lint --fix: Auto-apply formatting and simple fixes
-- 2. typecheck.sh: Run type checker (works for Python and other languages)
-- 3. bild: Build/compile the namespace
-- 4. bild --test: Run tests
runCoderVerify :: CoderConfig -> IO VerifyResult
runCoderVerify cfg = do
  -- First, run lint --fix to auto-apply formatting
  _ <- runBashCapture ("lint --fix " <> coderNamespace cfg <> ".hs 2>&1")

  -- Run typecheck.sh (handles Python mypy, etc.)
  (typecheckCode, typecheckOut, typecheckErr) <-
    runBashCapture ("typecheck.sh " <> coderNamespace cfg <> " 2>&1 || true")
  -- Note: typecheck.sh may not exist for all namespaces, so we use || true
  -- and only fail if it explicitly returns non-zero with actual errors
  let typecheckFailed =
        typecheckCode
          /= Exit.ExitSuccess
          && not (Text.null (Text.strip typecheckOut))
          && not ("No such file" `Text.isInfixOf` typecheckErr)

  if typecheckFailed
    then pure <| VerifyTypecheckFailed (Text.take 1000 (typecheckOut <> typecheckErr))
    else do
      -- Build the namespace
      (buildCode, buildOut, buildErr) <- runBashCapture ("bild " <> coderNamespace cfg <> " 2>&1")
      case buildCode of
        Exit.ExitSuccess -> do
          -- Run tests if they exist
          (testCode, testOut, testErr) <- runBashCapture ("bild --test " <> coderNamespace cfg <> " 2>&1")
          case testCode of
            Exit.ExitSuccess -> pure VerifySuccess
            _ -> pure <| VerifyTestFailed (Text.take 1000 (testOut <> testErr))
        _ -> pure <| VerifyBuildFailed (Text.take 1000 (buildOut <> buildErr))

-- | Phase 4: Commit - git commit with message
runCoderCommit :: CoderConfig -> Text -> IO (Either Text ())
runCoderCommit cfg summary = do
  let commitMsg = coderNamespace cfg <> ": " <> Text.take 72 summary

  -- Stage all changes
  (addCode, _, addErr) <- runBashCapture "git add -A"
  case addCode of
    Exit.ExitSuccess -> do
      -- Commit
      (commitCode, _, commitErr) <- runBashCapture ("git commit -m " <> quoteShell commitMsg)
      case commitCode of
        Exit.ExitSuccess -> pure (Right ())
        _ -> pure <| Left ("git commit failed: " <> commitErr)
    _ -> pure <| Left ("git add failed: " <> addErr)

-- | Phase 5: Recovery - revert changes on failure
runCoderRecovery :: IO ()
runCoderRecovery = do
  -- Revert uncommitted changes
  _ <- runBashCapture "git checkout -- ."
  _ <- runBashCapture "git clean -fd"
  pure ()

-- | Quote a string for shell
quoteShell :: Text -> Text
quoteShell t = "'" <> Text.replace "'" "'\\''" t <> "'"

-- | Tools available to the Coder subagent
coderTools :: [Engine.Tool]
coderTools =
  [ Tools.readFileTool,
    Tools.writeFileTool,
    Tools.editFileTool,
    Tools.runBashTool,
    Tools.searchCodebaseTool,
    Tools.searchAndReadTool
  ]

-- | Load system prompt from template, falling back to hardcoded if unavailable
loadCoderSystemPrompt :: CoderConfig -> Maybe Text -> IO Text
loadCoderSystemPrompt cfg maybeInitState = do
  let ctx =
        Aeson.object
          [ "namespace" .= coderNamespace cfg,
            "task" .= coderTask cfg,
            "context" .= coderContext cfg,
            "init_state" .= maybeInitState
          ]
  result <- Prompts.renderPrompt "subagents/coder/system" ctx
  case result of
    Right prompt -> pure prompt
    Left _err -> pure (coderSystemPrompt cfg maybeInitState)

-- | Hardcoded fallback system prompt for the Coder subagent
coderSystemPrompt :: CoderConfig -> Maybe Text -> Text
coderSystemPrompt cfg maybeInitState =
  Text.unlines
    [ "You are a specialized Coder subagent.",
      "",
      "## Your Task",
      coderTask cfg,
      "",
      "## Namespace",
      "You are working in namespace: " <> coderNamespace cfg,
      "Focus your changes on files in this namespace.",
      "",
      maybe "" (\ctx -> "## Context\n" <> ctx <> "\n") (coderContext cfg),
      maybe "" (\st -> "## Current State\n" <> st <> "\n") maybeInitState,
      "## Guidelines",
      "1. Make incremental changes - edit one file at a time",
      "2. After each edit, consider running `lint " <> coderNamespace cfg <> ".hs` to catch issues early",
      "3. Use `bild " <> coderNamespace cfg <> "` to check your work compiles",
      "4. If tests exist, run `bild --test " <> coderNamespace cfg <> "` to verify",
      "5. Keep changes focused on the task - don't refactor unrelated code",
      "6. If you find bugs unrelated to your task, note them but don't fix them",
      "",
      "## Completion",
      "When you believe you're done:",
      "1. Ensure all edits are complete",
      "2. The harness will automatically run lint --fix, build, and tests",
      "3. If verification fails, you'll be asked to fix the issues",
      "4. Provide a brief summary of what you changed",
      "",
      "## Important",
      "- Do NOT commit - the harness handles git operations",
      "- Focus on making the code changes correctly"
    ]

-- | Format verify error for agent retry prompt
formatVerifyError :: VerifyResult -> Text
formatVerifyError VerifySuccess = ""
formatVerifyError (VerifyLintFailed err) = "LINT ERRORS:\n" <> err
formatVerifyError (VerifyTypecheckFailed err) = "TYPE ERRORS:\n" <> err
formatVerifyError (VerifyBuildFailed err) = "BUILD ERRORS:\n" <> err
formatVerifyError (VerifyTestFailed err) = "TEST FAILURES:\n" <> err

-- | Main entry point: run the Coder subagent with all phases
--
-- This implements Anthropic's pattern of looping back to the agent on
-- verification failure. The agent gets up to coderMaxVerifyRetries attempts
-- to fix verification errors before we give up.
--
-- See: https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents
--
-- NOTE: We may want to change this behavior in the future. For some errors,
-- immediate failure might be better than retry loops. But for now, we follow
-- Anthropic's recommendation of letting the agent attempt to fix its mistakes.
runCoderSubagent :: Text -> CoderConfig -> IO (Either Text Aeson.Value)
runCoderSubagent openRouterKey cfg = do
  startTime <- Clock.getCurrentTime

  -- Phase 1: Init
  initResult <- runCoderInit cfg
  (ctx, initState) <- case initResult of
    InitFailed err -> pure (Left err, Nothing)
    InitBrokenState msg ctxVal -> pure (Right ctxVal, Just ("WARNING: Starting from broken state.\n" <> msg))
    InitSuccess ctxVal -> pure (Right ctxVal, Nothing)

  case ctx of
    Left err -> pure <| Left ("Init failed: " <> err)
    Right _context -> do
      result <- runWorkVerifyLoop openRouterKey cfg initState 0 0 0.0
      case result of
        Left err -> do
          runCoderRecovery
          pure <| Left err
        Right (finalMsg, totalTokens, totalCost, totalIters) -> do
          -- Phase 4: Commit
          let summary = Text.take 100 finalMsg
          commitResult <- runCoderCommit cfg summary
          case commitResult of
            Left err -> do
              runCoderRecovery
              pure <| Left ("Commit failed: " <> err)
            Right () -> do
              endTime <- Clock.getCurrentTime
              let duration = round (Clock.diffUTCTime endTime startTime)
              pure
                <| Right
                <| Aeson.object
                  [ "status" .= ("success" :: Text),
                    "namespace" .= coderNamespace cfg,
                    "summary" .= finalMsg,
                    "tokens_used" .= totalTokens,
                    "cost_cents" .= totalCost,
                    "duration_seconds" .= (duration :: Int),
                    "iterations" .= totalIters
                  ]

-- | Work-verify loop with retries on verification failure
--
-- Per Anthropic's guidance, we loop back to the agent when verification fails,
-- giving it a chance to fix its mistakes. This is more effective than
-- immediately failing, as the agent often can fix simple issues.
runWorkVerifyLoop ::
  Text ->
  CoderConfig ->
  Maybe Text ->
  Int ->
  Int ->
  Double ->
  IO (Either Text (Text, Int, Double, Int))
runWorkVerifyLoop openRouterKey cfg initState verifyAttempt accTokens accCost = do
  let isRetry = verifyAttempt > 0
  let maxRetries = coderMaxVerifyRetries cfg

  -- Build the prompt - include error context if this is a retry
  let baseTask = coderTask cfg
  let taskPrompt =
        if isRetry
          then baseTask <> "\n\n## VERIFICATION FAILED - PLEASE FIX\nThis is retry attempt " <> tshow verifyAttempt <> " of " <> tshow maxRetries <> "."
          else baseTask

  -- Phase 2: Work (run the agent)
  let provider = Provider.defaultOpenRouter openRouterKey (coderModel cfg)
  systemPrompt <- loadCoderSystemPrompt cfg initState
  let guardrails =
        Engine.Guardrails
          { Engine.guardrailMaxCostCents = coderMaxCost cfg - accCost,
            Engine.guardrailMaxTokens = coderMaxTokens cfg - accTokens,
            Engine.guardrailMaxDuplicateToolCalls = 20,
            Engine.guardrailMaxTestFailures = 5,
            Engine.guardrailMaxEditFailures = 10
          }
  let agentConfig =
        Engine.AgentConfig
          { Engine.agentModel = coderModel cfg,
            Engine.agentTools = coderTools,
            Engine.agentSystemPrompt = systemPrompt,
            Engine.agentMaxIterations = coderMaxIterations cfg,
            Engine.agentGuardrails = guardrails
          }
  let engineConfig = Engine.defaultEngineConfig

  let timeoutMicros = coderTimeout cfg * 1000000
  agentResult <-
    race
      (threadDelay timeoutMicros)
      (Engine.runAgentWithProvider engineConfig provider agentConfig taskPrompt)

  case agentResult of
    Left () -> pure <| Left "Coder subagent timed out"
    Right (Left err) -> pure <| Left ("Coder agent failed: " <> err)
    Right (Right result) -> do
      let newTokens = accTokens + Engine.resultTotalTokens result
      let newCost = accCost + Engine.resultTotalCost result
      let newIters = Engine.resultIterations result

      -- Phase 3: Verify
      verifyResult <- runCoderVerify cfg
      case verifyResult of
        VerifySuccess ->
          pure <| Right (Engine.resultFinalMessage result, newTokens, newCost, newIters)
        failure
          | verifyAttempt >= maxRetries -> do
              -- Max retries exceeded, give up
              let errMsg = "Verification failed after " <> tshow (verifyAttempt + 1) <> " attempts: " <> formatVerifyError failure
              pure <| Left errMsg
          | otherwise -> do
              -- Retry: loop back to agent with error context
              -- NOTE: This follows Anthropic's pattern. We may want to change
              -- this for certain error types in the future.
              let errorContext = formatVerifyError failure
              let retryState = case initState of
                    Just st -> Just (st <> "\n\n" <> errorContext)
                    Nothing -> Just errorContext
              runWorkVerifyLoop openRouterKey cfg retryState (verifyAttempt + 1) newTokens newCost