first commit

This commit is contained in:
2026-02-25 23:49:54 -05:00
commit 4d097161cb
1775 changed files with 452827 additions and 0 deletions

0
tests/__init__.py Normal file
View File

View File

@@ -0,0 +1,283 @@
"""Tests for ArchitectureTracker."""
import asyncio
import json
import os
import tempfile
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from app_factory.core.architecture_tracker import ArchitectureTracker
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def tmp_data_dir(tmp_path):
    """Temporary data directory."""
    # Return a path *inside* tmp_path so the tracker must create it itself.
    return str(tmp_path / "data")


@pytest.fixture
def tracker(tmp_data_dir):
    """ArchitectureTracker with no Claude SDK client."""
    return ArchitectureTracker(data_dir=tmp_data_dir)


@pytest.fixture
def tracker_with_client(tmp_data_dir):
    """ArchitectureTracker with a mocked Claude SDK client."""
    # Returns a (tracker, mock) pair so tests can program the mock's responses.
    mock_client = AsyncMock()
    t = ArchitectureTracker(data_dir=tmp_data_dir)
    t._client = mock_client
    return t, mock_client
# ---------------------------------------------------------------------------
# Initialization
# ---------------------------------------------------------------------------
class TestInitialization:
    """Default architecture document produced by a fresh tracker."""

    def test_creates_default_architecture(self, tracker):
        arch = tracker._architecture
        # Every top-level section must exist in a brand-new document.
        for section in ("modules", "utilities", "design_patterns",
                        "naming_conventions", "tech_stack", "last_updated"):
            assert section in arch
        assert arch["version"] == 1

    def test_creates_data_directory(self, tmp_data_dir):
        ArchitectureTracker(data_dir=tmp_data_dir)
        assert os.path.isdir(tmp_data_dir)

    def test_default_architecture_has_empty_lists(self, tracker):
        arch = tracker._architecture
        for key in ("modules", "utilities", "design_patterns"):
            assert arch[key] == []

    def test_default_naming_conventions(self, tracker):
        naming = tracker._architecture["naming_conventions"]
        assert naming["variables"] == "snake_case"
        assert naming["classes"] == "PascalCase"
# ---------------------------------------------------------------------------
# Load / Save Persistence
# ---------------------------------------------------------------------------
class TestPersistence:
    """Round-tripping the architecture document through disk."""

    def test_save_and_load(self, tmp_data_dir):
        # A second tracker pointed at the same directory must see data
        # written by the first one.
        tracker1 = ArchitectureTracker(data_dir=tmp_data_dir)
        tracker1.add_module("TestModule", "A test module", "test.py")
        tracker2 = ArchitectureTracker(data_dir=tmp_data_dir)
        assert len(tracker2._architecture["modules"]) == 1
        assert tracker2._architecture["modules"][0]["name"] == "TestModule"

    def test_save_updates_timestamp(self, tracker):
        # (Removed an unused `old_ts` capture: equality of before/after
        # timestamps is not comparable reliably at this resolution.)
        tracker.save_architecture(tracker._architecture)
        new_ts = tracker._architecture["last_updated"]
        # Timestamp should be present and non-empty after a save.
        assert new_ts is not None
        assert len(new_ts) > 0

    def test_load_corrupt_file_returns_default(self, tmp_data_dir):
        # A corrupt JSON file on disk must not crash the tracker; it should
        # fall back to the default document.
        os.makedirs(tmp_data_dir, exist_ok=True)
        corrupt_path = os.path.join(tmp_data_dir, "global_architecture.json")
        with open(corrupt_path, "w") as f:
            f.write("NOT VALID JSON {{{")
        tracker = ArchitectureTracker(data_dir=tmp_data_dir)
        assert tracker._architecture["version"] == 1
        assert tracker._architecture["modules"] == []
# ---------------------------------------------------------------------------
# add_module
# ---------------------------------------------------------------------------
class TestAddModule:
    """Registering modules in the architecture document."""

    def test_add_module(self, tracker):
        tracker.add_module("MyModule", "Does something", "src/my_module.py")
        modules = tracker._architecture["modules"]
        assert len(modules) == 1
        entry = modules[0]
        assert entry["name"] == "MyModule"
        assert entry["purpose"] == "Does something"
        assert entry["file_path"] == "src/my_module.py"

    def test_add_module_persists(self, tmp_data_dir):
        # Data added through one tracker must be visible to a fresh one.
        writer = ArchitectureTracker(data_dir=tmp_data_dir)
        writer.add_module("Persisted", "persists", "p.py")
        reader = ArchitectureTracker(data_dir=tmp_data_dir)
        names = [m["name"] for m in reader._architecture["modules"]]
        assert "Persisted" in names
# ---------------------------------------------------------------------------
# add_utility
# ---------------------------------------------------------------------------
class TestAddUtility:
    """Registering utility functions in the architecture document."""

    def test_add_utility(self, tracker):
        tracker.add_utility("helper_func", "Helps with things", "utils.py")
        utilities = tracker._architecture["utilities"]
        assert len(utilities) == 1
        entry = utilities[0]
        assert entry["name"] == "helper_func"
        assert entry["description"] == "Helps with things"
        assert entry["file_path"] == "utils.py"

    def test_add_utility_persists(self, tmp_data_dir):
        # Data added through one tracker must be visible to a fresh one.
        writer = ArchitectureTracker(data_dir=tmp_data_dir)
        writer.add_utility("persisted_func", "persists", "p.py")
        reader = ArchitectureTracker(data_dir=tmp_data_dir)
        names = [u["name"] for u in reader._architecture["utilities"]]
        assert "persisted_func" in names
# ---------------------------------------------------------------------------
# get_architecture_summary
# ---------------------------------------------------------------------------
class TestGetArchitectureSummary:
    """get_architecture_summary(): the human/LLM-readable summary string."""

    def test_returns_formatted_string(self, tracker):
        tracker.add_module("GraphEngine", "Runs the graph", "graph.py")
        tracker.add_utility("parse_json", "Parses JSON input", "utils.py")
        summary = tracker.get_architecture_summary()
        assert isinstance(summary, str)
        assert "Project Architecture Summary" in summary
        assert "GraphEngine" in summary
        assert "parse_json" in summary

    def test_includes_tech_stack(self, tracker):
        summary = tracker.get_architecture_summary()
        assert "Tech Stack" in summary
        assert "Python" in summary

    def test_includes_naming_conventions(self, tracker):
        summary = tracker.get_architecture_summary()
        assert "Naming Conventions" in summary
        assert "snake_case" in summary

    def test_respects_max_tokens_limit(self, tracker):
        # Add many modules to produce a large summary
        for i in range(200):
            tracker._architecture["modules"].append({
                "name": f"Module_{i}",
                "purpose": f"Purpose of module {i} with extra text for padding",
                "file_path": f"src/module_{i}.py",
            })
        # Very small token limit
        summary = tracker.get_architecture_summary(max_tokens=50)
        # 50 tokens * 4 chars = 200 chars max
        # (assumes the tracker's 4-chars-per-token heuristic -- TODO confirm
        # against the implementation)
        assert len(summary) <= 200

    def test_empty_architecture_still_returns_summary(self, tracker):
        # Even with no modules/utilities the header must still be emitted.
        summary = tracker.get_architecture_summary()
        assert "Project Architecture Summary" in summary
# ---------------------------------------------------------------------------
# update_architecture (async)
# ---------------------------------------------------------------------------
class TestUpdateArchitecture:
    """update_architecture(): extracting modules/utilities from changed files."""

    def test_basic_extraction_no_client(self, tracker, tmp_path):
        # Create a sample Python file
        py_file = tmp_path / "sample.py"
        py_file.write_text(
            '"""Sample module."""\n\n'
            'class SampleClass:\n'
            ' """A sample class for testing."""\n'
            ' pass\n\n'
            'def sample_function():\n'
            ' """A sample utility function."""\n'
            ' pass\n\n'
            'def _private_function():\n'
            ' """Should be skipped."""\n'
            ' pass\n'
        )
        task = {"title": "Test task", "description": "Testing extraction"}
        asyncio.run(tracker.update_architecture(task, [str(py_file)]))
        modules = tracker._architecture["modules"]
        utilities = tracker._architecture["utilities"]
        # Without an AI client, classes land in modules and public functions
        # in utilities.
        assert any(m["name"] == "SampleClass" for m in modules)
        assert any(u["name"] == "sample_function" for u in utilities)
        # Private functions should be skipped
        assert not any(u["name"] == "_private_function" for u in utilities)

    def test_skips_non_python_files(self, tracker, tmp_path):
        txt_file = tmp_path / "readme.txt"
        txt_file.write_text("Not a Python file")
        task = {"title": "Test task"}
        asyncio.run(tracker.update_architecture(task, [str(txt_file)]))
        assert tracker._architecture["modules"] == []
        assert tracker._architecture["utilities"] == []

    def test_skips_nonexistent_files(self, tracker):
        task = {"title": "Test task"}
        asyncio.run(tracker.update_architecture(task, ["/nonexistent/file.py"]))
        assert tracker._architecture["modules"] == []

    def test_ai_extraction_with_mock_client(self, tracker_with_client, tmp_path):
        tracker, mock_client = tracker_with_client
        py_file = tmp_path / "ai_sample.py"
        py_file.write_text(
            'class AIExtracted:\n'
            ' """Extracted by AI."""\n'
            ' pass\n'
        )
        # The mocked client returns structured JSON; both classes and
        # functions from it should be merged into the architecture.
        mock_response = MagicMock()
        mock_response.text = json.dumps({
            "classes": [{"name": "AIExtracted", "purpose": "AI detected class"}],
            "functions": [{"name": "ai_helper", "description": "AI detected function"}],
        })
        mock_client.complete.return_value = mock_response
        task = {"title": "AI test task"}
        asyncio.run(tracker.update_architecture(task, [str(py_file)]))
        modules = tracker._architecture["modules"]
        utilities = tracker._architecture["utilities"]
        assert any(m["name"] == "AIExtracted" for m in modules)
        assert any(u["name"] == "ai_helper" for u in utilities)

    def test_ai_extraction_fallback_on_failure(self, tracker_with_client, tmp_path):
        tracker, mock_client = tracker_with_client
        py_file = tmp_path / "fallback_sample.py"
        py_file.write_text(
            'class FallbackClass:\n'
            ' """Falls back to basic extraction."""\n'
            ' pass\n'
        )
        # A failing client must not break the update.
        mock_client.complete.side_effect = Exception("API error")
        task = {"title": "Fallback test"}
        asyncio.run(tracker.update_architecture(task, [str(py_file)]))
        # Should fall back to basic extraction
        modules = tracker._architecture["modules"]
        assert any(m["name"] == "FallbackClass" for m in modules)

    def test_no_duplicate_modules(self, tracker, tmp_path):
        py_file = tmp_path / "dup.py"
        py_file.write_text('class DupClass:\n pass\n')
        task = {"title": "Dup test"}
        # Processing the same file twice must not duplicate entries.
        asyncio.run(tracker.update_architecture(task, [str(py_file)]))
        asyncio.run(tracker.update_architecture(task, [str(py_file)]))
        names = [m["name"] for m in tracker._architecture["modules"]]
        assert names.count("DupClass") == 1

345
tests/test_claude_client.py Normal file
View File

@@ -0,0 +1,345 @@
"""Tests for Claude Agent SDK client adapter."""
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, call, patch
import pytest
from app_factory.core.claude_client import ClaudeSDKClient
class _FakeOptions:
def __init__(self, **kwargs):
self.kwargs = kwargs
class _FakeProcessError(RuntimeError):
def __init__(self, message: str, stderr: str):
super().__init__(message)
self.stderr = stderr
class TestClaudeSDKClient:
    """Behavioural tests for ClaudeSDKClient.complete().

    Each test swaps in a fake async ``_query`` generator and the
    ``_FakeOptions`` options class, so no real SDK process, network,
    or credentials are involved.
    """

    @pytest.mark.asyncio
    async def test_complete_uses_result_text_and_usage_tokens(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            # Intermediate assistant message followed by the final result.
            yield SimpleNamespace(
                content=[SimpleNamespace(text="intermediate answer")],
                model="claude-sonnet-4-6",
                usage=None,
                is_error=False,
            )
            yield SimpleNamespace(
                result="final answer",
                usage={"input_tokens": 12, "output_tokens": 34},
                is_error=False,
            )
        client._query = _fake_query
        result = await client.complete(
            "hello",
            model="claude-sonnet-4-6",
            system_prompt="You are a tester.",
            max_turns=1,
        )
        # The final result message wins over intermediate content blocks.
        assert result.text == "final answer"
        assert result.input_tokens == 12
        assert result.output_tokens == 34

    @pytest.mark.asyncio
    async def test_complete_raises_on_error_result(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            yield SimpleNamespace(
                result="something went wrong",
                usage={"input_tokens": 1, "output_tokens": 1},
                is_error=True,
            )
        client._query = _fake_query
        with pytest.raises(RuntimeError, match="something went wrong"):
            await client.complete("hello")

    @pytest.mark.asyncio
    async def test_complete_falls_back_to_content_blocks(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            yield SimpleNamespace(
                content=[SimpleNamespace(text="first"), SimpleNamespace(text="second")],
                model="claude-sonnet-4-6",
                usage={"cache_creation_input_tokens": 7, "output_tokens": 2},
                is_error=False,
            )
        client._query = _fake_query
        result = await client.complete("hello")
        # Without a result message, content blocks are joined with newlines
        # and cache-creation tokens count toward input tokens.
        assert result.text == "first\nsecond"
        assert result.input_tokens == 7
        assert result.output_tokens == 2

    @pytest.mark.asyncio
    async def test_complete_rejects_error_subtype(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            yield SimpleNamespace(
                subtype="error_during_execution",
                is_error=False,
                result=None,
                usage={"input_tokens": 0, "output_tokens": 0},
            )
        client._query = _fake_query
        # An error subtype is fatal even when is_error is False.
        with pytest.raises(RuntimeError, match="error_during_execution"):
            await client.complete("hello")

    @pytest.mark.asyncio
    async def test_complete_ignores_user_content_blocks(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            # Simulates a user/system message content block that should not be treated as model output.
            yield SimpleNamespace(
                content=[SimpleNamespace(text="[Request interrupted by user]")],
                is_error=False,
                usage=None,
            )
            yield SimpleNamespace(
                result="actual assistant result",
                subtype="success",
                is_error=False,
                usage={"input_tokens": 1, "output_tokens": 2},
            )
        client._query = _fake_query
        result = await client.complete("hello")
        assert result.text == "actual assistant result"

    @pytest.mark.asyncio
    async def test_complete_wraps_query_exception_with_hint(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            raise RuntimeError("Command failed with exit code 1")
            yield  # pragma: no cover
        client._query = _fake_query
        # Generic CLI failures get an auth hint appended for the operator.
        with pytest.raises(RuntimeError, match="Hint: verify Claude auth is available"):
            await client.complete("hello")

    @pytest.mark.asyncio
    async def test_complete_passes_debug_to_stderr_flag(self):
        client = ClaudeSDKClient(enable_debug=True)
        client._options_cls = _FakeOptions
        captured_options = None
        async def _fake_query(*, prompt, options):
            nonlocal captured_options
            captured_options = options
            yield SimpleNamespace(
                result="ok",
                subtype="success",
                is_error=False,
                usage={"input_tokens": 0, "output_tokens": 0},
            )
        client._query = _fake_query
        await client.complete("hello")
        assert captured_options is not None
        assert captured_options.kwargs["extra_args"] == {"debug-to-stderr": None}
        # debug_stderr must be a real file-like object (exposes fileno()).
        assert hasattr(captured_options.kwargs["debug_stderr"], "fileno")

    @pytest.mark.asyncio
    async def test_complete_omits_debug_stderr_without_debug_flag(self):
        client = ClaudeSDKClient(enable_debug=False)
        client._options_cls = _FakeOptions
        captured_options = None
        async def _fake_query(*, prompt, options):
            nonlocal captured_options
            captured_options = options
            yield SimpleNamespace(
                result="ok",
                subtype="success",
                is_error=False,
                usage={"input_tokens": 0, "output_tokens": 0},
            )
        client._query = _fake_query
        await client.complete("hello")
        assert captured_options is not None
        assert "extra_args" not in captured_options.kwargs
        assert "debug_stderr" not in captured_options.kwargs

    @pytest.mark.asyncio
    async def test_complete_includes_exception_stderr_detail(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        async def _fake_query(*, prompt, options):
            raise _FakeProcessError(
                "Command failed with exit code 1",
                "permission denied writing ~/.claude.json",
            )
            yield  # pragma: no cover
        client._query = _fake_query
        # The captured stderr is surfaced in the wrapped exception message.
        with pytest.raises(RuntimeError, match="permission denied writing ~/.claude.json"):
            await client.complete("hello")

    @pytest.mark.asyncio
    async def test_complete_uses_fallback_home_when_default_home_not_writable(self, tmp_path):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        captured_options = None
        cwd_dir = tmp_path / "workspace"
        cwd_dir.mkdir(parents=True, exist_ok=True)
        expected_home = cwd_dir / ".app_factory" / "claude_home"
        expected_home.mkdir(parents=True, exist_ok=True)
        # Force the "default HOME not writable" branch.
        client._claude_home_is_writable = lambda home: False
        client._prepare_fallback_claude_home = (
            lambda source_home, fallback_home: expected_home
        )
        async def _fake_query(*, prompt, options):
            nonlocal captured_options
            captured_options = options
            yield SimpleNamespace(
                result="ok",
                subtype="success",
                is_error=False,
                usage={"input_tokens": 0, "output_tokens": 0},
            )
        client._query = _fake_query
        await client.complete("hello", cwd=str(cwd_dir))
        assert captured_options is not None
        assert captured_options.kwargs["env"]["HOME"] == str(expected_home)

    @pytest.mark.asyncio
    async def test_complete_retries_rate_limit_event_then_succeeds(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        attempts = 0
        async def _fake_query(*, prompt, options):
            nonlocal attempts
            attempts += 1
            if attempts == 1:
                raise RuntimeError("Unknown message type: rate_limit_event")
            yield SimpleNamespace(
                result="ok",
                subtype="success",
                is_error=False,
                usage={"input_tokens": 2, "output_tokens": 3},
            )
        client._query = _fake_query
        with patch("app_factory.core.claude_client.asyncio.sleep", new=AsyncMock()) as mock_sleep:
            result = await client.complete("hello")
        assert result.text == "ok"
        assert attempts == 2
        # First backoff delay is 0.2 seconds.
        mock_sleep.assert_awaited_once_with(0.2)

    @pytest.mark.asyncio
    async def test_complete_retries_rate_limit_event_exhausts_then_fails(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        attempts = 0
        async def _fake_query(*, prompt, options):
            nonlocal attempts
            attempts += 1
            raise RuntimeError("Unknown message type: rate_limit_event")
            yield  # pragma: no cover
        client._query = _fake_query
        with patch("app_factory.core.claude_client.asyncio.sleep", new=AsyncMock()) as mock_sleep:
            with pytest.raises(RuntimeError, match="rate_limit_event"):
                await client.complete("hello")
        # One initial attempt plus three retries with exponential backoff.
        assert attempts == 4
        assert mock_sleep.await_args_list == [call(0.2), call(0.8), call(4.0)]

    @pytest.mark.asyncio
    async def test_complete_emits_observability_events(self):
        client = ClaudeSDKClient()
        client._options_cls = _FakeOptions
        mock_observability = MagicMock()
        async def _fake_query(*, prompt, options):
            yield SimpleNamespace(
                content=[
                    SimpleNamespace(
                        id="toolu_1",
                        name="Bash",
                        input={"command": "echo hi", "api_key": "secret"},
                    ),
                    SimpleNamespace(tool_use_id="toolu_1", content="done", is_error=False),
                    SimpleNamespace(text="Tool finished"),
                ],
                model="claude-sonnet-4-6",
                session_id="session-123",
                usage=None,
                is_error=False,
            )
            yield SimpleNamespace(
                subtype="success",
                duration_ms=250,
                duration_api_ms=200,
                is_error=False,
                num_turns=1,
                session_id="session-123",
                usage={"input_tokens": 3, "output_tokens": 4},
                result="ok",
            )
        client._query = _fake_query
        result = await client.complete(
            "hello",
            observability=mock_observability,
            agent_name="pm_agent",
            task_id="expand_prd",
        )
        assert result.text == "ok"
        # Loop variable is `c` (not `call`) so it does not shadow
        # unittest.mock.call, which this class uses elsewhere.
        event_types = [
            c.kwargs["event_type"]
            for c in mock_observability.log_claude_event.call_args_list
        ]
        assert "request_start" in event_types
        assert "tool_use" in event_types
        assert "tool_result" in event_types
        assert "result_message" in event_types
        assert "request_complete" in event_types
        tool_use_calls = [
            c.kwargs for c in mock_observability.log_claude_event.call_args_list
            if c.kwargs.get("event_type") == "tool_use"
        ]
        assert tool_use_calls
        # Sensitive tool-input fields must be redacted before logging.
        assert tool_use_calls[0]["payload"]["tool_input"]["api_key"] == "[REDACTED]"

279
tests/test_dev_agent.py Normal file
View File

@@ -0,0 +1,279 @@
"""Tests for DevAgentManager."""
import os
from unittest.mock import MagicMock, patch, mock_open
import pexpect
import pytest
from app_factory.agents.dev_agent import DevAgentManager, PROMPT_TEMPLATE_PATH
@pytest.fixture
def mock_docker_client():
    """A stand-in Docker client; tests never touch a real daemon."""
    return MagicMock()


@pytest.fixture
def manager(mock_docker_client):
    """DevAgentManager with a short timeout suitable for unit tests."""
    return DevAgentManager(docker_client=mock_docker_client, max_retries=3, timeout=60)


@pytest.fixture
def sample_task():
    """A representative task dict (task_id/title/description/details/testStrategy)."""
    return {
        "task_id": "42",
        "title": "Implement login endpoint",
        "description": "Create a POST /login endpoint with JWT",
        "details": "Use bcrypt for password hashing, return JWT token",
        "testStrategy": "Unit test auth flow, integration test endpoint",
    }
class TestInit:
    """Constructor behaviour of DevAgentManager."""

    def test_init_with_client(self, mock_docker_client):
        mgr = DevAgentManager(docker_client=mock_docker_client, max_retries=5, timeout=900)
        assert mgr.docker_client is mock_docker_client
        assert mgr.max_retries == 5
        assert mgr.timeout == 900
        assert mgr._retry_counts == {}

    def test_init_defaults(self, mock_docker_client):
        mgr = DevAgentManager(docker_client=mock_docker_client)
        assert (mgr.max_retries, mgr.timeout) == (3, 1800)

    def test_init_creates_docker_client_from_env(self):
        fake_client = MagicMock()
        fake_docker = MagicMock()
        fake_docker.from_env.return_value = fake_client
        # When no client is injected, the manager builds one from the
        # environment via the docker module.
        with patch.dict("sys.modules", {"docker": fake_docker}):
            mgr = DevAgentManager()
        assert mgr.docker_client is fake_client
class TestPrepareTaskPrompt:
    """prepare_task_prompt(): rendering the task prompt template."""

    def test_includes_all_fields(self, manager, sample_task):
        prompt = manager.prepare_task_prompt(sample_task, global_arch="Microservice arch")
        assert "42" in prompt
        assert "Implement login endpoint" in prompt
        assert "Create a POST /login endpoint with JWT" in prompt
        assert "Use bcrypt for password hashing" in prompt
        assert "Unit test auth flow" in prompt
        assert "Microservice arch" in prompt

    def test_without_global_arch(self, manager, sample_task):
        prompt = manager.prepare_task_prompt(sample_task)
        assert "No architecture context provided." in prompt
        assert "42" in prompt
        assert "Implement login endpoint" in prompt

    def test_with_empty_global_arch(self, manager, sample_task):
        # An empty string behaves the same as omitting the argument.
        prompt = manager.prepare_task_prompt(sample_task, global_arch="")
        assert "No architecture context provided." in prompt

    def test_uses_id_fallback(self, manager):
        # Tasks may carry "id" instead of "task_id".
        task = {"id": "99", "title": "Fallback task", "description": "desc"}
        prompt = manager.prepare_task_prompt(task)
        assert "99" in prompt

    def test_template_file_exists(self):
        assert PROMPT_TEMPLATE_PATH.exists(), f"Template not found at {PROMPT_TEMPLATE_PATH}"
@pytest.mark.asyncio
class TestExecuteTask:
    """execute_task(): driving Claude inside the container via pexpect."""

    async def test_success_path(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "Created src/auth.py\nModified tests/test_auth.py\n2 passed"
        mock_child.exitstatus = 0
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_task(sample_task, "container123", worktree)
        assert result["status"] == "success"
        assert result["exit_code"] == 0
        assert "src/auth.py" in result["files_changed"]
        assert isinstance(result["output"], str)

    async def test_failure_path(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "Error: compilation failed\n1 failed"
        mock_child.exitstatus = 1
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_task(sample_task, "container123", worktree)
        assert result["status"] == "failed"
        assert result["exit_code"] == 1

    async def test_timeout(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        # pexpect raises TIMEOUT from expect() when Claude never finishes.
        mock_child.expect.side_effect = pexpect.TIMEOUT("timed out")
        mock_child.close.return_value = None
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_task(sample_task, "container123", worktree)
        assert result["status"] == "failed"
        assert result["output"] == "timeout"
        assert result["exit_code"] == -1
        assert result["files_changed"] == []

    async def test_writes_prompt_file(self, manager, sample_task, tmp_path):
        # NOTE: removed dead scaffolding (written_content / capturing_open)
        # that was defined but never installed with patch; the cleanup
        # assertion below is the actual check this test performs.
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "done"
        mock_child.exitstatus = 0
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            await manager.execute_task(sample_task, "cid", worktree)
        # Prompt file should be cleaned up after execution
        prompt_path = os.path.join(worktree, ".task_prompt.txt")
        assert not os.path.exists(prompt_path), "Prompt file should be cleaned up"

    async def test_spawns_correct_command(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = ""
        mock_child.exitstatus = 0
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child) as mock_spawn:
            await manager.execute_task(sample_task, "abc123", worktree)
        mock_spawn.assert_called_once_with(
            "docker exec abc123 claude --print --prompt-file /workspace/.task_prompt.txt",
            timeout=60,
            encoding="utf-8",
        )

    async def test_none_exitstatus_treated_as_error(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "output"
        # pexpect reports exitstatus=None when the child did not exit normally.
        mock_child.exitstatus = None
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_task(sample_task, "cid", worktree)
        assert result["status"] == "failed"
        assert result["exit_code"] == -1
class TestParseClaudeOutput:
    """parse_claude_output(): pulling structure out of raw CLI output."""

    def test_extracts_files_changed(self, manager):
        raw = "Created src/auth.py\nModified tests/test_auth.py\nUpdated config.json"
        parsed = manager.parse_claude_output(raw)
        for path in ("src/auth.py", "tests/test_auth.py", "config.json"):
            assert path in parsed["files_changed"]

    def test_extracts_test_results(self, manager):
        parsed = manager.parse_claude_output("Results: 5 passed, 2 failed")
        assert parsed["test_results"]["passed"] == 5
        assert parsed["test_results"]["failed"] == 2

    def test_extracts_errors(self, manager):
        raw = "Error: could not import module\nFAILED: assertion mismatch"
        parsed = manager.parse_claude_output(raw)
        assert len(parsed["errors"]) >= 1

    def test_empty_output(self, manager):
        parsed = manager.parse_claude_output("")
        assert parsed["files_changed"] == []
        assert parsed["test_results"] == {}
        assert parsed["errors"] == []

    def test_no_test_results(self, manager):
        parsed = manager.parse_claude_output("Created app.py\nDone.")
        assert parsed["test_results"] == {}

    def test_deduplicates_files(self, manager):
        # The same file mentioned twice must appear only once.
        parsed = manager.parse_claude_output("Created app.py\nEditing app.py")
        assert parsed["files_changed"].count("app.py") == 1
@pytest.mark.asyncio
class TestExecuteWithRetry:
    """execute_with_retry(): retry loop around execute_task()."""

    async def test_succeeds_on_first_try(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "All good\n3 passed"
        mock_child.exitstatus = 0
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_with_retry(sample_task, "cid", worktree)
        assert result["status"] == "success"
        # The retry counter includes the successful attempt itself.
        assert manager.get_retry_count("42") == 1

    async def test_succeeds_after_failures(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        # First two calls fail, third succeeds
        fail_child = MagicMock()
        fail_child.before = "Error: build failed"
        fail_child.exitstatus = 1
        success_child = MagicMock()
        success_child.before = "All good"
        success_child.exitstatus = 0
        with patch(
            "app_factory.agents.dev_agent.pexpect.spawn",
            side_effect=[fail_child, fail_child, success_child],
        ):
            result = await manager.execute_with_retry(sample_task, "cid", worktree)
        assert result["status"] == "success"
        assert manager.get_retry_count("42") == 3

    async def test_max_retries_exceeded(self, manager, sample_task, tmp_path):
        worktree = str(tmp_path)
        mock_child = MagicMock()
        mock_child.before = "Error: persistent failure"
        mock_child.exitstatus = 1
        with patch("app_factory.agents.dev_agent.pexpect.spawn", return_value=mock_child):
            result = await manager.execute_with_retry(sample_task, "cid", worktree)
        # After exhausting max_retries (3), the task is escalated.
        assert result["status"] == "needs_clarification"
        assert manager.get_retry_count("42") == 3
class TestRetryCounters:
    """Bookkeeping helpers for per-task retry counts."""

    def test_get_retry_count_default(self, manager):
        # Unknown task ids default to zero retries.
        assert manager.get_retry_count("unknown") == 0

    def test_get_retry_count_after_set(self, manager):
        manager._retry_counts["task-1"] = 2
        assert manager.get_retry_count("task-1") == 2

    def test_reset_retry_count(self, manager):
        manager._retry_counts["task-1"] = 3
        manager.reset_retry_count("task-1")
        assert manager.get_retry_count("task-1") == 0

    def test_reset_nonexistent_task(self, manager):
        # Should not raise
        manager.reset_retry_count("nonexistent")
        assert manager.get_retry_count("nonexistent") == 0

575
tests/test_graph.py Normal file
View File

@@ -0,0 +1,575 @@
"""Tests for AppFactoryOrchestrator (LangGraph state machine)."""
import asyncio
import json
import os
import tempfile
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from app_factory.core.graph import AppFactoryOrchestrator, AppFactoryState
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def mock_pm_agent():
    """PM agent mock: expands prompts to a PRD and answers clarifications."""
    agent = AsyncMock()
    agent.expand_prompt_to_prd = AsyncMock(return_value="# Generated PRD\n\nObjective: Build an app")
    agent.handle_clarification_request = AsyncMock(return_value="Clarification: use REST API")
    return agent


@pytest.fixture
def mock_task_agent():
    """Task agent mock: yields two independent unblocked tasks."""
    agent = AsyncMock()
    agent.parse_prd = AsyncMock(return_value={"tasks": []})
    agent.get_unblocked_tasks = AsyncMock(return_value=[
        {"id": 1, "title": "Task 1", "status": "pending", "dependencies": []},
        {"id": 2, "title": "Task 2", "status": "pending", "dependencies": []},
    ])
    agent.update_task_status = AsyncMock()
    agent.get_task_details = AsyncMock(return_value={"id": 1, "title": "Task 1"})
    return agent


@pytest.fixture
def mock_dev_manager():
    """Dev manager mock: every task execution succeeds."""
    manager = AsyncMock()
    manager.execute_with_retry = AsyncMock(return_value={
        "status": "success",
        "output": "Task completed",
        "files_changed": ["app.py"],
        "exit_code": 0,
    })
    return manager


@pytest.fixture
def mock_qa_agent():
    """QA agent mock: reviews always merge cleanly."""
    agent = AsyncMock()
    agent.max_retries = 3
    agent.review_and_merge = AsyncMock(return_value={
        "status": "merged",
        "commit_sha": "abc123",
        "review_summary": "LGTM",
    })
    return agent


@pytest.fixture
def mock_workspace_manager():
    """Workspace manager mock: worktree and container creation succeed."""
    manager = AsyncMock()
    manager.create_worktree = AsyncMock(return_value="/tmp/worktree/task-1")
    manager.spin_up_clean_room = AsyncMock(return_value=MagicMock(id="container-123"))
    manager.cleanup_workspace = AsyncMock()
    return manager


@pytest.fixture
def mock_observability():
    """Observability mock with synchronous logging hooks."""
    obs = MagicMock()
    obs.log_state_transition = MagicMock()
    obs.log_error = MagicMock()
    return obs


@pytest.fixture
def orchestrator(mock_pm_agent, mock_task_agent, mock_dev_manager,
                 mock_qa_agent, mock_workspace_manager, mock_observability):
    """Orchestrator wired with all mocked collaborators."""
    return AppFactoryOrchestrator(
        pm_agent=mock_pm_agent,
        task_agent=mock_task_agent,
        dev_manager=mock_dev_manager,
        qa_agent=mock_qa_agent,
        workspace_manager=mock_workspace_manager,
        observability=mock_observability,
    )
# ---------------------------------------------------------------------------
# State Schema Tests
# ---------------------------------------------------------------------------
class TestStateSchema:
    """Shape and construction of the AppFactoryState TypedDict."""

    def test_state_has_all_required_fields(self):
        expected = {
            "user_input", "prd", "tasks", "active_tasks", "completed_tasks",
            "blocked_tasks", "clarification_requests", "global_architecture",
            "iteration_count", "max_iterations", "errors",
        }
        actual = set(AppFactoryState.__annotations__.keys())
        # Exact match: no missing fields, no unexpected extras.
        assert actual == expected

    def test_state_can_be_instantiated(self):
        state: AppFactoryState = {
            "user_input": "build a todo app",
            "prd": "",
            "tasks": [],
            "active_tasks": {},
            "completed_tasks": [],
            "blocked_tasks": {},
            "clarification_requests": [],
            "global_architecture": "",
            "iteration_count": 0,
            "max_iterations": 50,
            "errors": [],
        }
        assert state["max_iterations"] == 50
        assert state["user_input"] == "build a todo app"
# ---------------------------------------------------------------------------
# Graph Construction Tests
# ---------------------------------------------------------------------------
class TestGraphBuild:
    """build_graph(): the state machine compiles in every configuration."""

    def test_graph_builds_without_errors(self):
        # No agents supplied at all.
        orch = AppFactoryOrchestrator()
        compiled = orch.build_graph()
        assert compiled is not None

    def test_graph_builds_with_all_agents(self, orchestrator):
        compiled = orchestrator.build_graph()
        assert compiled is not None

    def test_graph_builds_with_none_agents(self):
        # Explicit None for every collaborator must also be tolerated.
        orch = AppFactoryOrchestrator(
            pm_agent=None, task_agent=None, dev_manager=None,
            qa_agent=None, workspace_manager=None, observability=None,
        )
        compiled = orch.build_graph()
        assert compiled is not None
# ---------------------------------------------------------------------------
# Node Tests
# ---------------------------------------------------------------------------
class TestPMNode:
    """Behavior of the PM (product-manager) node in the orchestration graph."""

    @pytest.mark.asyncio
    async def test_pm_node_sets_prd(self, orchestrator, mock_pm_agent):
        """A non-empty prompt is expanded into a PRD via the PM agent."""
        state = {"user_input": "build a todo app", "errors": []}
        result = await orchestrator._pm_node(state)
        assert "prd" in result
        assert result["prd"] == "# Generated PRD\n\nObjective: Build an app"
        mock_pm_agent.expand_prompt_to_prd.assert_called_once_with("build a todo app")

    @pytest.mark.asyncio
    async def test_pm_node_handles_no_input(self, orchestrator):
        """Empty user input yields an empty PRD and records an error."""
        state = {"user_input": "", "errors": []}
        result = await orchestrator._pm_node(state)
        assert result["prd"] == ""
        assert any("No user input" in e for e in result["errors"])

    @pytest.mark.asyncio
    async def test_pm_node_handles_agent_error(self, orchestrator, mock_pm_agent):
        """Agent exceptions are caught and surfaced in the error list."""
        mock_pm_agent.expand_prompt_to_prd.side_effect = RuntimeError("API down")
        state = {"user_input": "build app", "errors": []}
        result = await orchestrator._pm_node(state)
        assert result["prd"] == ""
        assert any("PM agent error" in e for e in result["errors"])

    @pytest.mark.asyncio
    async def test_pm_node_without_agent(self):
        """With no PM agent wired in, the node falls back to a mock PRD."""
        orch = AppFactoryOrchestrator()
        state = {"user_input": "build a todo app", "errors": []}
        result = await orch._pm_node(state)
        assert "Mock PRD" in result["prd"]
class TestTaskNode:
    """Behavior of the task-planning node."""

    @pytest.mark.asyncio
    async def test_task_node_populates_tasks(self, orchestrator, mock_task_agent):
        """On the first pass (no tasks yet) the PRD is parsed into tasks."""
        state = {"prd": "some PRD", "tasks": [], "iteration_count": 0, "max_iterations": 50, "errors": []}
        result = await orchestrator._task_node(state)
        assert "tasks" in result
        assert len(result["tasks"]) == 2
        mock_task_agent.parse_prd.assert_called_once()

    @pytest.mark.asyncio
    async def test_task_node_increments_iteration(self, orchestrator):
        """Each visit to the node bumps iteration_count by one."""
        state = {"prd": "PRD", "tasks": [], "iteration_count": 5, "max_iterations": 50, "errors": []}
        result = await orchestrator._task_node(state)
        assert result["iteration_count"] == 6

    @pytest.mark.asyncio
    async def test_task_node_refreshes_on_subsequent_passes(self, orchestrator, mock_task_agent):
        """With tasks already present the PRD is not re-parsed; only the
        unblocked-task list is refreshed."""
        state = {
            "prd": "PRD",
            "tasks": [{"id": 1, "title": "existing", "status": "pending", "dependencies": []}],
            "iteration_count": 1,
            "max_iterations": 50,
            "errors": [],
        }
        result = await orchestrator._task_node(state)
        mock_task_agent.parse_prd.assert_not_called()
        mock_task_agent.get_unblocked_tasks.assert_called()

    @pytest.mark.asyncio
    async def test_task_node_stops_at_max_iterations(self, orchestrator):
        """Hitting the iteration ceiling records a 'Max iterations' error."""
        state = {"prd": "PRD", "tasks": [], "iteration_count": 49, "max_iterations": 50, "errors": []}
        result = await orchestrator._task_node(state)
        assert result["iteration_count"] == 50
        assert any("Max iterations" in e for e in result["errors"])
class TestDevDispatchNode:
    """Behavior of the dev-dispatch node that fans tasks out to dev agents."""

    @pytest.mark.asyncio
    async def test_dev_dispatch_spawns_concurrent_tasks(self, orchestrator, mock_dev_manager, mock_workspace_manager):
        """Two pending tasks each get a worktree and a dev-agent execution."""
        state = {
            "tasks": [
                {"id": 1, "title": "Task 1", "status": "pending", "dependencies": []},
                {"id": 2, "title": "Task 2", "status": "pending", "dependencies": []},
            ],
            "completed_tasks": [],
            "active_tasks": {},
            "errors": [],
            "clarification_requests": [],
            "global_architecture": "",
        }
        result = await orchestrator._dev_dispatch_node(state)
        # Task ids are tracked as strings in completed_tasks.
        assert "1" in result["completed_tasks"]
        assert "2" in result["completed_tasks"]
        assert mock_dev_manager.execute_with_retry.call_count == 2
        assert mock_workspace_manager.create_worktree.call_count == 2

    @pytest.mark.asyncio
    async def test_dev_dispatch_handles_needs_clarification(self, orchestrator, mock_dev_manager):
        """A needs_clarification result becomes a clarification request."""
        mock_dev_manager.execute_with_retry.return_value = {
            "status": "needs_clarification",
            "output": "Cannot figure out API format",
            "files_changed": [],
            "exit_code": -1,
        }
        state = {
            "tasks": [{"id": 1, "title": "Task 1", "status": "pending", "dependencies": []}],
            "completed_tasks": [],
            "active_tasks": {},
            "errors": [],
            "clarification_requests": [],
            "global_architecture": "",
        }
        result = await orchestrator._dev_dispatch_node(state)
        assert len(result["clarification_requests"]) == 1
        assert result["clarification_requests"][0]["task_id"] == "1"

    @pytest.mark.asyncio
    async def test_dev_dispatch_skips_completed(self, orchestrator):
        """Already-completed tasks yield an empty state update."""
        state = {
            "tasks": [{"id": 1, "title": "Task 1", "status": "pending", "dependencies": []}],
            "completed_tasks": ["1"],
            "active_tasks": {},
            "errors": [],
            "clarification_requests": [],
            "global_architecture": "",
        }
        result = await orchestrator._dev_dispatch_node(state)
        assert result == {}

    @pytest.mark.asyncio
    async def test_dev_dispatch_without_agents(self):
        """Without a dev manager the node still marks tasks complete (mock path)."""
        orch = AppFactoryOrchestrator()
        state = {
            "tasks": [
                {"id": 1, "title": "Task 1", "status": "pending", "dependencies": []},
            ],
            "completed_tasks": [],
            "active_tasks": {},
            "errors": [],
            "clarification_requests": [],
            "global_architecture": "",
        }
        result = await orch._dev_dispatch_node(state)
        assert "1" in result["completed_tasks"]
class TestQANode:
    """Behavior of the QA node that reviews and merges completed work."""

    @pytest.mark.asyncio
    async def test_qa_node_processes_results(self, orchestrator, mock_qa_agent, mock_task_agent):
        """A successful task is reviewed, merged, and marked 'merged'."""
        state = {
            "tasks": [{"id": "1", "title": "Task 1"}],
            "active_tasks": {"1": {"status": "success", "worktree_path": "/tmp/wt"}},
            "completed_tasks": ["1"],
            "errors": [],
            "clarification_requests": [],
            "blocked_tasks": {},
        }
        result = await orchestrator._qa_node(state)
        mock_qa_agent.review_and_merge.assert_called_once()
        assert result["active_tasks"]["1"]["status"] == "merged"

    @pytest.mark.asyncio
    async def test_qa_node_no_tasks_for_qa(self, orchestrator):
        """Nothing to review yields an empty state update."""
        state = {
            "tasks": [],
            "active_tasks": {},
            "completed_tasks": [],
            "errors": [],
            "clarification_requests": [],
            "blocked_tasks": {},
        }
        result = await orchestrator._qa_node(state)
        assert result == {}

    @pytest.mark.asyncio
    async def test_qa_node_handles_qa_failure(self, orchestrator, mock_qa_agent):
        """Exhausted test retries escalate into a clarification request."""
        mock_qa_agent.review_and_merge.return_value = {
            "status": "tests_failed",
            "retry_count": 3,
        }
        state = {
            "tasks": [{"id": "1", "title": "Task 1"}],
            "active_tasks": {"1": {"status": "success", "worktree_path": "/tmp/wt"}},
            "completed_tasks": ["1"],
            "errors": [],
            "clarification_requests": [],
            "blocked_tasks": {},
        }
        result = await orchestrator._qa_node(state)
        assert len(result["clarification_requests"]) == 1
# ---------------------------------------------------------------------------
# Routing Tests
# ---------------------------------------------------------------------------
class TestRoutingAfterTasks:
    """Routing decisions taken after the task-planning node."""

    @staticmethod
    def _state(**overrides):
        """Baseline routing state; keyword overrides customize each case."""
        base = {
            "tasks": [],
            "completed_tasks": [],
            "blocked_tasks": {},
            "clarification_requests": [],
            "iteration_count": 1,
            "max_iterations": 50,
        }
        base.update(overrides)
        return base

    def test_routes_to_dev_dispatch_when_unblocked(self, orchestrator):
        state = self._state(
            tasks=[{"id": 1, "status": "pending", "dependencies": []}],
        )
        assert orchestrator._should_continue_after_tasks(state) == "dev_dispatch"

    def test_routes_to_end_when_all_done(self, orchestrator):
        state = self._state(
            tasks=[{"id": 1, "status": "done", "dependencies": []}],
            completed_tasks=["1"],
        )
        assert orchestrator._should_continue_after_tasks(state) == "end"

    def test_routes_to_clarification_when_blocked(self, orchestrator):
        state = self._state(
            tasks=[{"id": 1, "status": "pending", "dependencies": [2]}],
            blocked_tasks={"1": "dependency not met"},
        )
        assert orchestrator._should_continue_after_tasks(state) == "clarification"

    def test_routes_to_end_at_max_iterations(self, orchestrator):
        state = self._state(
            tasks=[{"id": 1, "status": "pending", "dependencies": []}],
            iteration_count=50,
        )
        assert orchestrator._should_continue_after_tasks(state) == "end"

    def test_routes_to_end_when_no_tasks(self, orchestrator):
        assert orchestrator._should_continue_after_tasks(self._state()) == "end"
class TestRoutingAfterQA:
    """Routing decisions taken after the QA node."""

    @staticmethod
    def _state(**overrides):
        """Baseline post-QA state; keyword overrides customize each case."""
        base = {
            "clarification_requests": [],
            "iteration_count": 1,
            "max_iterations": 50,
        }
        base.update(overrides)
        return base

    def test_routes_to_task_node(self, orchestrator):
        assert orchestrator._should_continue_after_qa(self._state()) == "task_node"

    def test_routes_to_clarification(self, orchestrator):
        state = self._state(clarification_requests=[{"question": "What API?"}])
        assert orchestrator._should_continue_after_qa(state) == "clarification"

    def test_routes_to_end_at_max_iterations(self, orchestrator):
        state = self._state(iteration_count=50)
        assert orchestrator._should_continue_after_qa(state) == "end"
# ---------------------------------------------------------------------------
# Iteration Safety Tests
# ---------------------------------------------------------------------------
class TestIterationSafety:
    """Guards against unbounded graph loops."""

    @pytest.mark.asyncio
    async def test_iteration_limit_prevents_infinite_loop(self, orchestrator):
        """At the cap, the task node records a 'Max iterations' error."""
        near_limit = {
            "prd": "PRD",
            "tasks": [],
            "iteration_count": 49,
            "max_iterations": 50,
            "errors": [],
        }
        outcome = await orchestrator._task_node(near_limit)
        assert outcome["iteration_count"] == 50
        assert any("Max iterations" in e for e in outcome["errors"])
# ---------------------------------------------------------------------------
# State Persistence Tests
# ---------------------------------------------------------------------------
class TestStatePersistence:
    """save_state / load_state round-trips through JSON files on disk."""

    def test_save_state(self, orchestrator):
        """A saved state can be read back as JSON with the same values."""
        with tempfile.TemporaryDirectory() as tmpdir:
            target = os.path.join(tmpdir, "state.json")
            snapshot = {
                "user_input": "test",
                "prd": "PRD content",
                "tasks": [{"id": 1}],
                "active_tasks": {},
                "completed_tasks": ["1"],
                "blocked_tasks": {},
                "clarification_requests": [],
                "global_architecture": "",
                "iteration_count": 5,
                "max_iterations": 50,
                "errors": [],
            }
            orchestrator.save_state(snapshot, target)
            assert os.path.exists(target)
            with open(target) as fh:
                persisted = json.load(fh)
            assert persisted["user_input"] == "test"
            assert persisted["iteration_count"] == 5

    def test_load_state(self, orchestrator):
        """load_state returns the dict previously dumped to the file."""
        with tempfile.TemporaryDirectory() as tmpdir:
            target = os.path.join(tmpdir, "state.json")
            with open(target, "w") as fh:
                json.dump({"user_input": "hello", "prd": "PRD", "iteration_count": 3}, fh)
            restored = orchestrator.load_state(target)
            assert restored["user_input"] == "hello"
            assert restored["iteration_count"] == 3

    def test_save_state_creates_directory(self, orchestrator):
        """Missing parent directories are created on save."""
        with tempfile.TemporaryDirectory() as tmpdir:
            target = os.path.join(tmpdir, "nested", "dir", "state.json")
            orchestrator.save_state({"user_input": "test"}, target)
            assert os.path.exists(target)
# ---------------------------------------------------------------------------
# Clarification Node Tests
# ---------------------------------------------------------------------------
class TestClarificationNode:
    """Behavior of the clarification node that resolves blocked tasks."""

    @pytest.mark.asyncio
    async def test_clarification_resolves_requests(self, orchestrator, mock_pm_agent):
        """Pending requests are answered by the PM agent and tasks unblocked."""
        state = {
            "clarification_requests": [
                {"requesting_agent": "dev", "task_id": "1", "question": "What format?", "context": ""},
            ],
            "blocked_tasks": {"1": "needs clarification"},
            "errors": [],
        }
        result = await orchestrator._clarification_node(state)
        assert result["clarification_requests"] == []
        assert "1" not in result["blocked_tasks"]
        mock_pm_agent.handle_clarification_request.assert_called_once()

    @pytest.mark.asyncio
    async def test_clarification_handles_empty_requests(self, orchestrator):
        """With no requests the node leaves the list empty."""
        state = {"clarification_requests": [], "blocked_tasks": {}, "errors": []}
        result = await orchestrator._clarification_node(state)
        assert result["clarification_requests"] == []

    @pytest.mark.asyncio
    async def test_clarification_without_agent(self):
        """Without a PM agent the node still clears requests (fallback path)."""
        orch = AppFactoryOrchestrator()
        state = {
            "clarification_requests": [{"task_id": "1", "question": "?"}],
            "blocked_tasks": {"1": "blocked"},
            "errors": [],
        }
        result = await orch._clarification_node(state)
        assert result["clarification_requests"] == []
# ---------------------------------------------------------------------------
# End-to-End Run Test
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Full orchestrator runs with mocked (or absent) collaborators.

    Fix: the original tests created a TemporaryDirectory and a state_path
    local that were never used (save_state is patched out anyway), plus an
    unused mock_save alias; the dead setup is removed.
    """

    @pytest.mark.asyncio
    async def test_run_executes_end_to_end(self, mock_pm_agent, mock_task_agent,
                                           mock_dev_manager, mock_qa_agent,
                                           mock_workspace_manager, mock_observability):
        """One dispatched task followed by an empty batch terminates the run."""
        # After the first get_unblocked_tasks returns work, the second call
        # returns an empty list so the graph routes to "end".
        mock_task_agent.get_unblocked_tasks = AsyncMock(
            side_effect=[
                [{"id": 1, "title": "Task 1", "status": "pending", "dependencies": []}],
                [],  # No more tasks - triggers end
            ]
        )
        orch = AppFactoryOrchestrator(
            pm_agent=mock_pm_agent,
            task_agent=mock_task_agent,
            dev_manager=mock_dev_manager,
            qa_agent=mock_qa_agent,
            workspace_manager=mock_workspace_manager,
            observability=mock_observability,
        )
        # Stub persistence so the run never writes to disk.
        with patch.object(orch, "save_state"):
            result = await orch.run("build a todo app")
        assert result["prd"] != ""
        mock_pm_agent.expand_prompt_to_prd.assert_called_once_with("build a todo app")
        mock_task_agent.parse_prd.assert_called_once()

    @pytest.mark.asyncio
    async def test_run_with_no_agents(self):
        """With no agents wired, run() falls back to mock implementations."""
        orch = AppFactoryOrchestrator()
        with patch.object(orch, "save_state"):
            result = await orch.run("build something")
        assert "Mock PRD" in result["prd"]

456
tests/test_main.py Normal file
View File

@@ -0,0 +1,456 @@
"""Tests for main.py entry point, error handling, and integration."""
import asyncio
import os
import signal
import sys
from datetime import datetime, timezone
from io import StringIO
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from main import (
AppFactoryError,
ClarificationTimeout,
ConfigurationError,
DockerDaemonError,
GracefulShutdown,
GitError,
MCPConnectionError,
main,
parse_args,
print_summary,
run_factory,
validate_environment,
)
# ---------------------------------------------------------------------------
# parse_args tests
# ---------------------------------------------------------------------------
class TestParseArgs:
    """Command-line parsing behavior of parse_args."""

    def test_prompt_required(self):
        """Omitting --prompt makes argparse exit."""
        with pytest.raises(SystemExit):
            parse_args([])

    def test_prompt_only(self):
        """With only --prompt, every other option takes its default."""
        parsed = parse_args(["--prompt", "Build a REST API"])
        assert parsed.prompt == "Build a REST API"
        assert parsed.repo_path == os.getcwd()
        assert parsed.max_concurrent_tasks == 5
        assert parsed.debug is False
        assert parsed.dry_run is False

    def test_all_options(self):
        """Every flag round-trips through the parser."""
        parsed = parse_args([
            "--prompt", "Build an app",
            "--repo-path", "/tmp/project",
            "--max-concurrent-tasks", "3",
            "--debug",
            "--dry-run",
        ])
        assert parsed.prompt == "Build an app"
        assert parsed.repo_path == "/tmp/project"
        assert parsed.max_concurrent_tasks == 3
        assert parsed.debug is True
        assert parsed.dry_run is True

    def test_max_concurrent_tasks_default(self):
        assert parse_args(["--prompt", "test"]).max_concurrent_tasks == 5

    def test_repo_path_default_is_cwd(self):
        assert parse_args(["--prompt", "test"]).repo_path == os.getcwd()
# ---------------------------------------------------------------------------
# validate_environment tests
# ---------------------------------------------------------------------------
class TestValidateEnvironment:
    """Environment validation: API keys, Docker daemon, and git presence."""

    def test_valid_config(self):
        """A fully configured environment returns the parsed config dict."""
        env = {
            "ANTHROPIC_API_KEY": "sk-test-key",
            "LANGSMITH_API_KEY": "ls-key",
            "LANGSMITH_PROJECT": "my-project",
        }
        with patch.dict(os.environ, env, clear=False), \
                patch("main.subprocess.run") as mock_run, \
                patch("main.shutil.which", return_value="/usr/bin/git"):
            # Docker check succeeds (returncode 0) and git is found.
            mock_run.return_value = MagicMock(returncode=0)
            config = validate_environment()
            assert config["api_key"] == "sk-test-key"
            assert config["auth_token"] == ""
            assert config["langsmith_api_key"] == "ls-key"
            assert config["langsmith_project"] == "my-project"

    def test_missing_api_key_still_works(self):
        """API key is optional (Claude Code OAuth supported)."""
        with patch.dict(os.environ, {}, clear=True), \
                patch("main.subprocess.run") as mock_run, \
                patch("main.shutil.which", return_value="/usr/bin/git"):
            mock_run.return_value = MagicMock(returncode=0)
            config = validate_environment()
            assert config["api_key"] == ""
            assert config["auth_token"] == ""

    def test_docker_not_running(self):
        """A non-zero docker exit code raises DockerDaemonError."""
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "sk-test-key"}, clear=False), \
                patch("main.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=1)
            with pytest.raises(DockerDaemonError, match="not running"):
                validate_environment()

    def test_docker_not_found(self):
        """A missing docker binary raises DockerDaemonError('not found')."""
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "sk-test-key"}, clear=False), \
                patch("main.subprocess.run", side_effect=FileNotFoundError):
            with pytest.raises(DockerDaemonError, match="not found"):
                validate_environment()

    def test_docker_timeout(self):
        """A hung docker command raises DockerDaemonError('not responding')."""
        import subprocess as sp
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "sk-test-key"}, clear=False), \
                patch("main.subprocess.run", side_effect=sp.TimeoutExpired("docker", 10)):
            with pytest.raises(DockerDaemonError, match="not responding"):
                validate_environment()

    def test_git_not_found(self):
        """An absent git executable raises GitError."""
        with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "sk-test-key"}, clear=False), \
                patch("main.subprocess.run") as mock_run, \
                patch("main.shutil.which", return_value=None):
            mock_run.return_value = MagicMock(returncode=0)
            with pytest.raises(GitError, match="git not found"):
                validate_environment()
# ---------------------------------------------------------------------------
# print_summary tests
# ---------------------------------------------------------------------------
class TestPrintSummary:
    """Console summary rendering by print_summary."""

    @staticmethod
    def _result(**overrides):
        """Baseline run-result dict; keyword overrides customize each case."""
        base = {
            "completed_tasks": [],
            "tasks": [],
            "errors": [],
            "iteration_count": 0,
        }
        base.update(overrides)
        return base

    def test_basic_summary(self, capsys):
        result = self._result(
            completed_tasks=["1", "2"],
            tasks=[{"id": 1}, {"id": 2}, {"id": 3}],
            iteration_count=5,
        )
        print_summary(result, datetime.now(timezone.utc))
        out = capsys.readouterr().out
        assert "2 / 3" in out
        assert "Iterations" in out

    def test_summary_with_errors(self, capsys):
        result = self._result(
            tasks=[{"id": 1}],
            errors=["Error one", "Error two"],
            iteration_count=1,
        )
        print_summary(result, datetime.now(timezone.utc))
        out = capsys.readouterr().out
        assert "Errors" in out
        assert "Error one" in out

    def test_summary_truncates_many_errors(self, capsys):
        result = self._result(errors=[f"Error {i}" for i in range(10)])
        print_summary(result, datetime.now(timezone.utc))
        assert "and 5 more" in capsys.readouterr().out

    def test_summary_with_langsmith(self, capsys):
        with patch.dict(os.environ, {"LANGSMITH_PROJECT": "test-proj"}):
            print_summary(self._result(), datetime.now(timezone.utc))
        assert "test-proj" in capsys.readouterr().out

    def test_summary_empty_result(self, capsys):
        print_summary(self._result(), datetime.now(timezone.utc))
        assert "0 / 0" in capsys.readouterr().out
# ---------------------------------------------------------------------------
# GracefulShutdown tests
# ---------------------------------------------------------------------------
class TestGracefulShutdown:
    """Signal handling: first signal requests shutdown, second force-exits."""

    def test_initial_state(self):
        """A fresh handler has no shutdown request and no workspace manager."""
        with patch("main.signal.signal"):
            gs = GracefulShutdown()
        assert gs.shutdown_requested is False
        assert gs.workspace_manager is None

    def test_registers_signals(self):
        """The constructor installs _handler for both SIGINT and SIGTERM."""
        with patch("main.signal.signal") as mock_signal:
            gs = GracefulShutdown()
            # call_args_list entries are (args, kwargs); args is (signum, handler).
            calls = [c[0] for c in mock_signal.call_args_list]
            assert (signal.SIGINT, gs._handler) in calls
            assert (signal.SIGTERM, gs._handler) in calls

    def test_first_signal_sets_flag(self):
        """The first signal only flips shutdown_requested to True."""
        with patch("main.signal.signal"):
            gs = GracefulShutdown()
        # Simulate first signal
        with patch("builtins.print"):
            gs._handler(signal.SIGINT, None)
        assert gs.shutdown_requested is True

    def test_second_signal_force_exits(self):
        """A second signal while shutdown is already pending exits the process."""
        with patch("main.signal.signal"):
            gs = GracefulShutdown()
        gs.shutdown_requested = True
        with patch("builtins.print"), pytest.raises(SystemExit):
            gs._handler(signal.SIGINT, None)

    def test_first_signal_triggers_cleanup(self):
        """Handler fired inside a running event loop still sets the flag.

        NOTE(review): presumably _handler schedules workspace cleanup on the
        running loop when one exists - confirm against main.GracefulShutdown;
        this test only asserts the flag.
        """
        mock_ws = MagicMock()
        mock_ws.cleanup_all = AsyncMock()
        with patch("main.signal.signal"):
            gs = GracefulShutdown(workspace_manager=mock_ws)
        # Simulate handler with a running loop
        loop = asyncio.new_event_loop()
        async def _run():
            with patch("builtins.print"):
                gs._handler(signal.SIGINT, None)
        loop.run_until_complete(_run())
        loop.close()
        assert gs.shutdown_requested is True
# ---------------------------------------------------------------------------
# run_factory tests
# ---------------------------------------------------------------------------
class TestRunFactory:
    """Wiring test: run_factory constructs and connects every component."""

    @pytest.mark.asyncio
    async def test_initializes_all_components(self):
        """Every agent/manager is built with the expected arguments and the
        orchestrator is awaited with the original prompt."""
        mock_orchestrator_instance = MagicMock()
        mock_orchestrator_instance.run = AsyncMock(return_value={
            "completed_tasks": ["1"],
            "tasks": [{"id": 1}],
            "errors": [],
            "iteration_count": 1,
        })
        args = MagicMock()
        args.prompt = "Build a REST API"
        args.repo_path = "/tmp/test-repo"
        config = {
            "api_key": "sk-test-key",
            "auth_token": "",
            "langsmith_api_key": "",
            "langsmith_project": "app-factory",
        }
        # Patch every collaborator at its defining module so nothing real
        # (Docker, git, API clients) is touched.
        with patch("app_factory.core.observability.ObservabilityManager") as mock_obs, \
                patch("app_factory.core.workspace.WorkspaceManager") as mock_ws, \
                patch("app_factory.core.architecture_tracker.ArchitectureTracker") as mock_arch, \
                patch("app_factory.agents.pm_agent.PMAgent") as mock_pm, \
                patch("app_factory.agents.task_agent.TaskMasterAgent") as mock_task, \
                patch("app_factory.agents.dev_agent.DevAgentManager") as mock_dev, \
                patch("app_factory.agents.qa_agent.QAAgent") as mock_qa, \
                patch("app_factory.core.graph.AppFactoryOrchestrator") as mock_orch, \
                patch("main.GracefulShutdown", create=True):
            mock_orch.return_value = mock_orchestrator_instance
            mock_ws.return_value = MagicMock(docker_client=MagicMock())
            result = await run_factory(args, config)
            assert result["completed_tasks"] == ["1"]
            mock_obs.assert_called_once()
            mock_ws.assert_called_once_with(repo_path="/tmp/test-repo")
            mock_arch.assert_called_once_with(
                api_key="sk-test-key",
                auth_token=None,
                debug=False,
                observability=mock_obs.return_value,
            )
            mock_pm.assert_called_once_with(
                api_key="sk-test-key",
                auth_token=None,
                debug=False,
                observability=mock_obs.return_value,
            )
            mock_task.assert_called_once_with(project_root="/tmp/test-repo")
            mock_dev.assert_called_once()
            mock_qa.assert_called_once_with(
                repo_path="/tmp/test-repo",
                api_key="sk-test-key",
                auth_token=None,
                debug=False,
                observability=mock_obs.return_value,
            )
            mock_orch.assert_called_once()
            mock_orchestrator_instance.run.assert_awaited_once_with("Build a REST API")
# ---------------------------------------------------------------------------
# main() integration tests
# ---------------------------------------------------------------------------
class TestMainEntryPoint:
    """Integration-style tests for main(): exit codes and control flow."""

    def test_dry_run_validates_without_executing(self):
        """--dry-run validates the environment and prints a success line."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment") as mock_validate, \
                patch("builtins.print") as mock_print:
            mock_args.return_value = MagicMock(
                prompt="test", debug=False, dry_run=True,
            )
            mock_validate.return_value = {"api_key": "sk-test"}
            main()
            mock_print.assert_called_with(
                "Dry-run: configuration is valid. All checks passed."
            )

    def test_configuration_error_exits(self):
        """ConfigurationError from validation exits with code 1."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment", side_effect=ConfigurationError("no key")), \
                patch("builtins.print"), \
                pytest.raises(SystemExit) as exc_info:
            mock_args.return_value = MagicMock(debug=False, dry_run=False)
            main()
        assert exc_info.value.code == 1

    def test_docker_error_exits(self):
        """DockerDaemonError from validation exits with code 1."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment", side_effect=DockerDaemonError("not running")), \
                patch("builtins.print"), \
                pytest.raises(SystemExit) as exc_info:
            mock_args.return_value = MagicMock(debug=False, dry_run=False)
            main()
        assert exc_info.value.code == 1

    def test_git_error_exits(self):
        """GitError from validation exits with code 1."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment", side_effect=GitError("no git")), \
                patch("builtins.print"), \
                pytest.raises(SystemExit) as exc_info:
            mock_args.return_value = MagicMock(debug=False, dry_run=False)
            main()
        assert exc_info.value.code == 1

    def test_clarification_timeout_exits(self):
        """ClarificationTimeout raised during the run exits with code 1."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment") as mock_validate, \
                patch("main.asyncio.run", side_effect=ClarificationTimeout("task 5")), \
                patch("builtins.print"), \
                pytest.raises(SystemExit) as exc_info:
            mock_args.return_value = MagicMock(
                prompt="test", debug=False, dry_run=False,
            )
            mock_validate.return_value = {"api_key": "sk-test"}
            main()
        assert exc_info.value.code == 1

    def test_generic_exception_exits(self):
        """Any unexpected exception during the run exits with code 1."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment") as mock_validate, \
                patch("main.asyncio.run", side_effect=RuntimeError("boom")), \
                patch("builtins.print"), \
                pytest.raises(SystemExit) as exc_info:
            mock_args.return_value = MagicMock(
                prompt="test", debug=False, dry_run=False,
            )
            mock_validate.return_value = {"api_key": "sk-test"}
            main()
        assert exc_info.value.code == 1

    def test_debug_flag_sets_logging(self):
        """--debug configures logging at DEBUG level."""
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment") as mock_validate, \
                patch("main.logging.basicConfig") as mock_logging, \
                patch("builtins.print"):
            mock_args.return_value = MagicMock(
                prompt="test", debug=True, dry_run=True,
            )
            mock_validate.return_value = {"api_key": "sk-test"}
            main()
            mock_logging.assert_called_once()
            call_kwargs = mock_logging.call_args[1]
            assert call_kwargs["level"] == 10  # logging.DEBUG
    def test_successful_run(self):
        """A clean run forwards the result dict to print_summary."""
        mock_result = {
            "completed_tasks": ["1"],
            "tasks": [{"id": 1}],
            "errors": [],
            "iteration_count": 3,
        }
        with patch("main.load_dotenv"), \
                patch("main.parse_args") as mock_args, \
                patch("main.validate_environment") as mock_validate, \
                patch("main.asyncio.run", return_value=mock_result), \
                patch("main.print_summary") as mock_summary:
            mock_args.return_value = MagicMock(
                prompt="test", debug=False, dry_run=False,
            )
            mock_validate.return_value = {"api_key": "sk-test"}
            main()
            mock_summary.assert_called_once()
            # Verify the result was passed to print_summary
            assert mock_summary.call_args[0][0] == mock_result
# ---------------------------------------------------------------------------
# Exception hierarchy tests
# ---------------------------------------------------------------------------
class TestExceptionHierarchy:
    """Every specific error type derives from the shared base."""

    def test_all_exceptions_inherit_from_base(self):
        for exc_type in (
            ClarificationTimeout,
            DockerDaemonError,
            GitError,
            MCPConnectionError,
            ConfigurationError,
        ):
            assert issubclass(exc_type, AppFactoryError)

    def test_base_inherits_from_exception(self):
        assert issubclass(AppFactoryError, Exception)

426
tests/test_observability.py Normal file
View File

@@ -0,0 +1,426 @@
"""Tests for ObservabilityManager."""
import asyncio
import logging
import os
import sys
from unittest.mock import MagicMock, patch
import pytest
from app_factory.core.observability import ObservabilityManager, _StructuredFormatter
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def obs():
    """ObservabilityManager built while the langsmith import is broken."""
    # Mapping the module name to None in sys.modules makes
    # ``import langsmith`` fail, exercising the graceful-degradation path.
    with patch.dict(sys.modules, {"langsmith": None}):
        manager = ObservabilityManager("test-project")
        assert manager._client is None
        return manager
@pytest.fixture
def obs_with_client():
    """ObservabilityManager paired with a mocked LangSmith client."""
    fake_client = MagicMock()
    fake_module = MagicMock()
    fake_module.Client.return_value = fake_client
    with patch.dict(sys.modules, {"langsmith": fake_module}):
        manager = ObservabilityManager("test-project")
    return manager, fake_client
# ---------------------------------------------------------------------------
# Initialization
# ---------------------------------------------------------------------------
class TestInitialization:
    """Construction and configuration of ObservabilityManager."""

    def test_explicit_project_name(self, obs):
        assert obs.project_name == "test-project"

    def test_default_project_from_env(self):
        """LANGSMITH_PROJECT supplies the default project name."""
        with patch.dict(os.environ, {"LANGSMITH_PROJECT": "env-project"}), \
                patch.dict(sys.modules, {"langsmith": None}):
            manager = ObservabilityManager()
            assert manager.project_name == "env-project"

    def test_fallback_project_name(self):
        """Without LANGSMITH_PROJECT the name falls back to 'app-factory'."""
        scrubbed = os.environ.copy()
        scrubbed.pop("LANGSMITH_PROJECT", None)
        with patch.dict(os.environ, scrubbed, clear=True), \
                patch.dict(sys.modules, {"langsmith": None}):
            manager = ObservabilityManager()
            assert manager.project_name == "app-factory"

    def test_graceful_degradation_no_langsmith(self, obs):
        assert obs._client is None

    def test_logger_created(self, obs):
        logger = obs.logger
        assert logger is not None
        assert logger.level == logging.DEBUG
        assert logger.propagate is False

    def test_claude_event_mode_from_env(self):
        with patch.dict(os.environ, {"APP_FACTORY_CLAUDE_EVENT_MODE": "verbose"}), \
                patch.dict(sys.modules, {"langsmith": None}):
            manager = ObservabilityManager("test-project")
            assert manager._claude_event_mode == "verbose"
# ---------------------------------------------------------------------------
# Tracing lifecycle
# ---------------------------------------------------------------------------
class TestTracing:
    """start_trace / end_trace lifecycle and LangSmith forwarding."""

    def test_start_trace_returns_run_id(self, obs):
        """start_trace yields a 32-character hex-uuid run id."""
        run_id = obs.start_trace("pm_agent", "1.2")
        assert isinstance(run_id, str)
        assert len(run_id) == 32  # hex uuid

    def test_start_and_end_trace(self, obs):
        """Ending a trace removes it from the active-run registry."""
        run_id = obs.start_trace("pm_agent", "1.2", inputs={"key": "val"})
        obs.end_trace(run_id, outputs={"result": "ok"})
        assert run_id not in obs._active_runs

    def test_end_trace_with_error(self, obs):
        """Ending a trace with an error increments the error metric."""
        run_id = obs.start_trace("pm_agent", "1.2")
        obs.end_trace(run_id, error="something went wrong")
        metrics = obs.get_metrics()
        assert metrics["total_errors"] == 1

    def test_langsmith_create_run_called(self, obs_with_client):
        """start_trace forwards to the LangSmith client's create_run."""
        manager, mock_client = obs_with_client
        manager.start_trace("agent", "1.1", inputs={"x": 1})
        mock_client.create_run.assert_called_once()

    def test_langsmith_update_run_called(self, obs_with_client):
        """end_trace forwards to the LangSmith client's update_run."""
        manager, mock_client = obs_with_client
        run_id = manager.start_trace("agent", "1.1")
        manager.end_trace(run_id, outputs={"y": 2})
        mock_client.update_run.assert_called_once()

    def test_langsmith_failure_does_not_raise(self):
        """A failing LangSmith client must not break tracing."""
        mock_client = MagicMock()
        mock_client.create_run.side_effect = Exception("network error")
        mock_langsmith = MagicMock()
        mock_langsmith.Client.return_value = mock_client
        with patch.dict(sys.modules, {"langsmith": mock_langsmith}):
            manager = ObservabilityManager("test-project")
            # Should not raise
            run_id = manager.start_trace("agent", "1.1")
            assert isinstance(run_id, str)
# ---------------------------------------------------------------------------
# Decorator
# ---------------------------------------------------------------------------
class TestDecorator:
    """trace_agent_execution decorator on sync and async callables."""

    def test_sync_decorator(self, obs):
        @obs.trace_agent_execution("agent", "1.1")
        def doubler(value):
            return value * 2

        assert doubler(5) == 10
        assert obs.get_metrics()["total_traces"] == 1

    def test_async_decorator(self, obs):
        @obs.trace_agent_execution("agent", "1.1")
        async def incremented(value):
            return value + 1

        assert asyncio.run(incremented(10)) == 11
        assert obs.get_metrics()["total_traces"] == 1

    def test_decorator_records_error(self, obs):
        @obs.trace_agent_execution("agent", "1.1")
        def exploding():
            raise ValueError("boom")

        with pytest.raises(ValueError, match="boom"):
            exploding()
        assert obs.get_metrics()["total_errors"] == 1

    def test_decorator_preserves_function_name(self, obs):
        @obs.trace_agent_execution("agent", "1.1")
        def my_named_func():
            pass

        assert my_named_func.__name__ == "my_named_func"
# ---------------------------------------------------------------------------
# Async context manager
# ---------------------------------------------------------------------------
class TestAsyncContextManager:
    """trace_context opens and closes a trace around an async block."""

    def test_trace_context_success(self, obs):
        async def scenario():
            async with obs.trace_context("agent", "1.1") as rid:
                assert isinstance(rid, str)
            return rid

        rid = asyncio.run(scenario())
        assert rid not in obs._active_runs
        assert obs.get_metrics()["total_traces"] == 1

    def test_trace_context_exception(self, obs):
        async def scenario():
            with pytest.raises(RuntimeError):
                async with obs.trace_context("agent", "1.1"):
                    raise RuntimeError("test error")

        asyncio.run(scenario())
        assert obs.get_metrics()["total_errors"] == 1
# ---------------------------------------------------------------------------
# Async trace_agent helper
# ---------------------------------------------------------------------------
class TestTraceAgent:
    """trace_agent awaits a coroutine factory inside a trace."""

    def test_trace_agent_success(self, obs):
        async def payload():
            return 42

        async def scenario():
            return await obs.trace_agent("agent", "1.1", payload)

        assert asyncio.run(scenario()) == 42
        assert obs.get_metrics()["total_traces"] == 1
# ---------------------------------------------------------------------------
# Logging format
# ---------------------------------------------------------------------------
class TestLoggingFormat:
    """Rendering behaviour of the structured log formatter."""

    @staticmethod
    def _record(level, msg):
        # Build a bare LogRecord; agent_name/task_id are attached by callers.
        return logging.LogRecord(
            name="test",
            level=level,
            pathname="",
            lineno=0,
            msg=msg,
            args=(),
            exc_info=None,
        )

    def test_structured_formatter(self):
        formatter = _StructuredFormatter(use_color=False)
        record = self._record(logging.INFO, "hello world")
        record.agent_name = "PM_AGENT"
        record.task_id = "task-1.2"
        line = formatter.format(record)
        for fragment in ("[PM_AGENT]", "[task-1.2]", "[INFO]", "hello world"):
            assert fragment in line
        # Leading ISO timestamp, and no ANSI escapes when color is off.
        assert line.startswith("[20")
        assert "\x1b[" not in line

    def test_formatter_defaults(self):
        """Records without agent/task metadata fall back to placeholders."""
        formatter = _StructuredFormatter(use_color=False)
        line = formatter.format(self._record(logging.WARNING, "warn msg"))
        assert "[SYSTEM]" in line
        assert "[-]" in line

    def test_formatter_can_render_colors(self):
        formatter = _StructuredFormatter(use_color=True)
        record = self._record(logging.INFO, "Trace started: run_id=abc123")
        record.agent_name = "PM_AGENT"
        record.task_id = "task-1.2"
        assert "\x1b[" in formatter.format(record)
# ---------------------------------------------------------------------------
# Logging methods
# ---------------------------------------------------------------------------
class TestLoggingMethods:
    """High-level logging helpers: state transitions, token usage, errors,
    and Claude event logging (including noise filtering)."""

    class _CaptureHandler(logging.Handler):
        # Collects raw (unformatted) log messages so tests can assert on
        # what was logged without capturing stdout.
        def __init__(self):
            super().__init__()
            self.messages: list[str] = []

        def emit(self, record):
            self.messages.append(record.getMessage())

    def test_log_state_transition(self, obs):
        # Transitions are rendered as "<old> -> <new>".
        capture = self._CaptureHandler()
        obs.logger.addHandler(capture)
        obs.log_state_transition("idle", "processing", {"reason": "new task"})
        obs.logger.removeHandler(capture)
        assert any("idle -> processing" in m for m in capture.messages)

    def test_log_token_usage(self, obs):
        # Token counts roll up both globally and per agent.
        obs.log_token_usage("pm_agent", "1.1", input_tokens=100, output_tokens=50, model="claude-3")
        metrics = obs.get_metrics()
        assert metrics["total_tokens"] == 150
        assert metrics["per_agent"]["pm_agent"]["tokens"] == 150

    def test_log_error(self, obs):
        # Errors logged from an except block count globally and per agent.
        try:
            raise ValueError("test error")
        except ValueError as e:
            obs.log_error("pm_agent", "1.1", e, context={"step": "parse"})
        metrics = obs.get_metrics()
        assert metrics["total_errors"] == 1
        assert metrics["per_agent"]["pm_agent"]["errors"] == 1

    def test_log_claude_event(self, obs):
        # Every event increments claude_events; only tool_use increments tool_calls.
        obs.log_claude_event("pm_agent", "1.1", "tool_use", {"tool_name": "Bash"})
        obs.log_claude_event("pm_agent", "1.1", "result_message", {"subtype": "success"})
        metrics = obs.get_metrics()
        assert metrics["total_claude_events"] == 2
        assert metrics["total_tool_calls"] == 1
        assert metrics["per_agent"]["pm_agent"]["claude_events"] == 2
        assert metrics["per_agent"]["pm_agent"]["tool_calls"] == 1

    def test_log_claude_event_readable_format(self, obs):
        # tool_use events render human-readably with the tool input inline.
        capture = self._CaptureHandler()
        obs.logger.addHandler(capture)
        obs.log_claude_event(
            "pm_agent",
            "1.1",
            "tool_use",
            {"tool_name": "Bash", "tool_use_id": "toolu_1", "tool_input": {"command": "pwd"}},
        )
        obs.logger.removeHandler(capture)
        combined = "\n".join(capture.messages)
        assert "Claude tool call: Bash" in combined
        assert "command=pwd" in combined

    def test_log_claude_event_noisy_tool_result_is_suppressed_on_success(self, obs):
        # Successful results for noisy tools (e.g. Read) are not logged at all.
        capture = self._CaptureHandler()
        obs.logger.addHandler(capture)
        obs.log_claude_event(
            "pm_agent",
            "1.1",
            "tool_use",
            {"tool_name": "Read", "tool_use_id": "toolu_1", "tool_input": {"file_path": "README.md"}},
        )
        obs.log_claude_event(
            "pm_agent",
            "1.1",
            "tool_result",
            {"tool_use_id": "toolu_1", "is_error": False, "content": "x" * 1000},
        )
        obs.logger.removeHandler(capture)
        combined = "\n".join(capture.messages)
        assert "Claude tool call: Read path=README.md" in combined
        assert "Claude tool result: Read" not in combined

    def test_log_claude_event_noisy_tool_result_logs_errors_with_context(self, obs):
        # Failed results are always logged, including the originating tool's
        # context (matched by tool_use_id).
        capture = self._CaptureHandler()
        obs.logger.addHandler(capture)
        obs.log_claude_event(
            "pm_agent",
            "1.1",
            "tool_use",
            {"tool_name": "Read", "tool_use_id": "toolu_1", "tool_input": {"file_path": "README.md"}},
        )
        obs.log_claude_event(
            "pm_agent",
            "1.1",
            "tool_result",
            {"tool_use_id": "toolu_1", "is_error": True, "content": "permission denied"},
        )
        obs.logger.removeHandler(capture)
        combined = "\n".join(capture.messages)
        assert "Claude tool result: Read status=error path=README.md error=permission denied" in combined

    def test_log_claude_event_filters_noise_by_default(self, obs):
        # Low-signal event types (stream_event) are dropped in the default mode.
        capture = self._CaptureHandler()
        obs.logger.addHandler(capture)
        obs.log_claude_event("pm_agent", "1.1", "stream_event", {"event": "delta"})
        obs.logger.removeHandler(capture)
        combined = "\n".join(capture.messages)
        assert "stream_event" not in combined

    def test_log_claude_event_verbose_mode_includes_noise(self):
        # In "verbose" mode even stream events are logged.
        with patch.dict(sys.modules, {"langsmith": None}):
            manager = ObservabilityManager("test-project", claude_event_mode="verbose")
            capture = self._CaptureHandler()
            manager.logger.addHandler(capture)
            manager.log_claude_event("pm_agent", "1.1", "stream_event", {"event": "delta"})
            manager.logger.removeHandler(capture)
            combined = "\n".join(capture.messages)
            assert "Claude event: type=stream_event" in combined
# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
class TestMetrics:
    """Aggregate metric counters on the observability manager."""

    def test_initial_metrics(self, obs):
        metrics = obs.get_metrics()
        for key in ("total_tokens", "total_traces", "total_errors",
                    "total_claude_events", "total_tool_calls"):
            assert metrics[key] == 0

    def test_metrics_accumulate(self, obs):
        for agent, task in (("a1", "1.1"), ("a1", "1.2"), ("a2", "2.1")):
            obs.start_trace(agent, task)
        obs.log_token_usage("a1", "1.1", 10, 20)
        obs.log_token_usage("a2", "2.1", 5, 5)
        metrics = obs.get_metrics()
        assert metrics["total_traces"] == 3
        assert metrics["total_tokens"] == 40
        per_agent = metrics["per_agent"]
        assert (per_agent["a1"]["traces"], per_agent["a2"]["traces"]) == (2, 1)
        assert (per_agent["a1"]["tokens"], per_agent["a2"]["tokens"]) == (30, 10)
# ---------------------------------------------------------------------------
# Graceful degradation
# ---------------------------------------------------------------------------
class TestGracefulDegradation:
    """The manager must be fully functional when LangSmith is absent."""

    def test_all_operations_work_without_langsmith(self, obs):
        """Every public method should work fine with _client=None."""
        assert obs._client is None
        rid = obs.start_trace("agent", "1.1", inputs={"a": 1})
        obs.end_trace(rid, outputs={"b": 2})
        obs.end_trace("nonexistent-id", error="whatever")  # unknown id: no-op
        obs.log_state_transition("a", "b")
        obs.log_token_usage("agent", "1.1", 10, 20)
        try:
            raise RuntimeError("test")
        except RuntimeError as exc:
            obs.log_error("agent", "1.1", exc)
        assert obs.get_metrics()["total_traces"] == 1

257
tests/test_pm_agent.py Normal file
View File

@@ -0,0 +1,257 @@
"""Tests for PMAgent."""
import os
import tempfile
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
import pytest
from app_factory.agents.pm_agent import PMAgent
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_api_response(text, input_tokens=10, output_tokens=20):
"""Build a fake Claude SDK completion response."""
return SimpleNamespace(text=text, input_tokens=input_tokens, output_tokens=output_tokens)
def _build_agent(**kwargs):
    """Create a PMAgent whose Claude SDK client is an AsyncMock.

    Returns the (agent, mock_client) pair so tests can stub completions.
    """
    with patch("app_factory.agents.pm_agent.ClaudeSDKClient") as sdk_cls:
        client = AsyncMock()
        sdk_cls.return_value = client
        agent = PMAgent(api_key="test-key", **kwargs)
        agent.client = client  # make sure the agent talks to our mock
    return agent, client
# ---------------------------------------------------------------------------
# Initialization
# ---------------------------------------------------------------------------
class TestInitialization:
    """Constructor behaviour of PMAgent."""

    def test_no_api_key_uses_default_client(self):
        """Without ANTHROPIC_API_KEY the SDK client gets no credentials.

        A single ``patch.dict(os.environ, {}, clear=True)`` already
        guarantees the key is absent; the previous nested copy/pop dance
        operated on an already-cleared environ and was a no-op.
        """
        with patch.dict(os.environ, {}, clear=True), \
                patch("app_factory.agents.pm_agent.ClaudeSDKClient") as mock_mod:
            mock_mod.return_value = AsyncMock()
            PMAgent()  # constructed only for its side effect on the mock
            mock_mod.assert_called_once_with(
                api_key=None,
                auth_token=None,
                enable_debug=False,
            )

    def test_api_key_from_param(self):
        # NOTE(review): this only asserts the default model, not that the
        # api_key parameter is actually forwarded — consider strengthening.
        agent, _ = _build_agent()
        assert agent.model == "claude-opus-4-6"

    def test_custom_model(self):
        agent, _ = _build_agent(model="claude-opus-4-6")
        assert agent.model == "claude-opus-4-6"
# ---------------------------------------------------------------------------
# Template loading
# ---------------------------------------------------------------------------
class TestTemplateLoading:
    """Prompt templates are loaded from disk by file name."""

    def test_load_prd_template(self):
        agent, _ = _build_agent()
        text = agent._load_template("pm_prd_expansion.txt")
        assert "Product Manager" in text
        assert "Objective" in text

    def test_load_clarification_template(self):
        agent, _ = _build_agent()
        text = agent._load_template("pm_clarification.txt")
        # Placeholder and escalation marker must survive in the template.
        assert "{requesting_agent}" in text
        assert "ESCALATE_TO_HUMAN" in text

    def test_load_missing_template_raises(self):
        agent, _ = _build_agent()
        with pytest.raises(FileNotFoundError):
            agent._load_template("nonexistent.txt")
# ---------------------------------------------------------------------------
# PRD expansion
# ---------------------------------------------------------------------------
class TestExpandPromptToPrd:
    """PRD expansion round-trips through the mocked SDK client."""

    @pytest.mark.asyncio
    async def test_returns_prd_markdown(self):
        agent, client = _build_agent()
        prd_text = "".join([
            "# Objective\nBuild a todo app\n",
            "# Core Requirements\n1. Add tasks\n",
            "# Technical Architecture\nMonolith\n",
            "# Tech Stack\nPython, FastAPI\n",
            "# Success Criteria\nAll tests pass\n",
            "# Non-Functional Requirements\n<1s response",
        ])
        client.complete = AsyncMock(
            return_value=_make_api_response(prd_text, 15, 100)
        )
        prd = await agent.expand_prompt_to_prd("Build a todo app")
        for heading in (
            "Objective",
            "Core Requirements",
            "Technical Architecture",
            "Tech Stack",
            "Success Criteria",
            "Non-Functional Requirements",
        ):
            assert heading in prd
        client.complete.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_tracks_token_usage(self):
        agent, client = _build_agent()
        client.complete = AsyncMock(
            return_value=_make_api_response("prd content", 50, 200)
        )
        await agent.expand_prompt_to_prd("some input")
        usage = agent.get_token_usage()
        assert (usage["input_tokens"], usage["output_tokens"]) == (50, 200)
        assert usage["total_tokens"] == 250
# ---------------------------------------------------------------------------
# Clarification handling
# ---------------------------------------------------------------------------
class TestHandleClarification:
    """Clarification requests are answered directly or escalated to a human."""

    @pytest.mark.asyncio
    async def test_auto_resolve(self):
        agent, client = _build_agent()
        client.complete = AsyncMock(
            return_value=_make_api_response("Use PostgreSQL for the database.")
        )
        answer = await agent.handle_clarification_request({
            "requesting_agent": "dev_agent",
            "task_id": "2.1",
            "question": "Which database should I use?",
            "context": "PRD says relational DB",
        })
        assert answer == "Use PostgreSQL for the database."

    @pytest.mark.asyncio
    async def test_escalate_to_human(self):
        # The escalation sentinel triggers a prompt on stdin.
        agent, client = _build_agent()
        client.complete = AsyncMock(
            return_value=_make_api_response("ESCALATE_TO_HUMAN")
        )
        with patch("builtins.input", return_value="Use MySQL") as mock_input:
            answer = await agent.handle_clarification_request({
                "requesting_agent": "dev_agent",
                "task_id": "3.1",
                "question": "Which vendor should we pick?",
                "context": "",
            })
        assert answer == "Use MySQL"
        mock_input.assert_called_once()

    @pytest.mark.asyncio
    async def test_tracks_tokens(self):
        agent, client = _build_agent()
        client.complete = AsyncMock(
            return_value=_make_api_response("answer", 5, 10)
        )
        request = {
            "requesting_agent": "qa",
            "task_id": "1.1",
            "question": "q",
            "context": "c",
        }
        await agent.handle_clarification_request(request)
        usage = agent.get_token_usage()
        assert (usage["input_tokens"], usage["output_tokens"]) == (5, 10)
# ---------------------------------------------------------------------------
# PRD updates
# ---------------------------------------------------------------------------
class TestUpdatePrd:
    """update_prd appends time-stamped update sections to the PRD file."""

    @staticmethod
    def _write_prd(initial):
        # Create a closed temp file seeded with the given PRD content;
        # callers are responsible for unlinking it.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
            f.write(initial)
            return f.name

    def test_appends_with_version_header(self):
        agent, _ = _build_agent()
        prd_path = self._write_prd("# Original PRD\nSome content\n")
        try:
            agent.update_prd(prd_path, "Added authentication requirement.")
            with open(prd_path) as f:
                content = f.read()
            assert "# Original PRD" in content
            assert "## PRD Update -" in content
            assert "Added authentication requirement." in content
        finally:
            os.unlink(prd_path)

    def test_multiple_updates_maintain_history(self):
        agent, _ = _build_agent()
        prd_path = self._write_prd("# PRD v1\n")
        try:
            agent.update_prd(prd_path, "Update 1")
            agent.update_prd(prd_path, "Update 2")
            with open(prd_path) as f:
                content = f.read()
            assert content.count("## PRD Update -") == 2
            assert "Update 1" in content
            assert "Update 2" in content
        finally:
            os.unlink(prd_path)
# ---------------------------------------------------------------------------
# Token usage
# ---------------------------------------------------------------------------
class TestTokenUsage:
    """Token accounting across SDK calls."""

    def test_initial_usage_zero(self):
        agent, _ = _build_agent()
        assert agent.get_token_usage() == {
            "input_tokens": 0,
            "output_tokens": 0,
            "total_tokens": 0,
        }

    @pytest.mark.asyncio
    async def test_accumulates_across_calls(self):
        agent, client = _build_agent()
        client.complete = AsyncMock(side_effect=[
            _make_api_response("r1", 10, 20),
            _make_api_response("r2", 30, 40),
        ])
        await agent.expand_prompt_to_prd("first call")
        await agent.expand_prompt_to_prd("second call")
        assert agent.get_token_usage() == {
            "input_tokens": 40,
            "output_tokens": 60,
            "total_tokens": 100,
        }

550
tests/test_qa_agent.py Normal file
View File

@@ -0,0 +1,550 @@
"""Tests for QAAgent."""
import os
import subprocess
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch
import git as gitmod
import pytest
from app_factory.agents.qa_agent import QAAgent
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_review_response(text, input_tokens=10, output_tokens=20):
"""Build a fake Claude SDK completion response."""
return SimpleNamespace(text=text, input_tokens=input_tokens, output_tokens=output_tokens)
def _build_agent(repo_path="/fake/repo", **kwargs):
    """Create a QAAgent with git.Repo and the Claude SDK client mocked out.

    Returns (agent, mock_repo, mock_client) so tests can stub git calls
    and SDK completions independently.
    """
    with patch("app_factory.agents.qa_agent.git.Repo") as repo_cls, \
            patch("app_factory.agents.qa_agent.ClaudeSDKClient") as sdk_cls:
        repo = MagicMock()
        repo_cls.return_value = repo
        client = AsyncMock()
        sdk_cls.return_value = client
        agent = QAAgent(repo_path=repo_path, api_key="test-key", **kwargs)
        agent.client = client
        agent.repo = repo
    return agent, repo, client
# Canned review responses in the line-oriented format the QA review prompt
# asks the model to emit (APPROVED flag, ISSUES list, SUMMARY line).
# APPROVED_REVIEW: passing review with a single informational note.
APPROVED_REVIEW = """\
APPROVED: true
ISSUES:
- [severity: info] Minor style suggestion
SUMMARY: Code looks good overall."""

# REJECTED_REVIEW: failing review with one critical and one warning issue.
REJECTED_REVIEW = """\
APPROVED: false
ISSUES:
- [severity: critical] SQL injection in query builder
- [severity: warning] Missing input validation
SUMMARY: Critical security issue found."""
# ---------------------------------------------------------------------------
# Initialization
# ---------------------------------------------------------------------------
class TestInitialization:
    """Constructor behaviour of QAAgent."""

    def test_no_api_key_uses_default_client(self):
        """Without ANTHROPIC_API_KEY the SDK client gets no credentials.

        One ``patch.dict(os.environ, {}, clear=True)`` already removes the
        key; the previous nested copy/pop of an already-cleared environ was
        a no-op and has been dropped.
        """
        with patch("app_factory.agents.qa_agent.git.Repo"), \
                patch("app_factory.agents.qa_agent.ClaudeSDKClient") as mock_sdk_client, \
                patch.dict(os.environ, {}, clear=True):
            mock_sdk_client.return_value = AsyncMock()
            QAAgent(repo_path="/fake")  # constructed only for its side effect
            mock_sdk_client.assert_called_once_with(
                api_key=None,
                auth_token=None,
                enable_debug=False,
            )

    def test_creates_with_api_key(self):
        agent, _repo, _client = _build_agent()
        assert agent.max_retries == 3

    def test_custom_max_retries(self):
        agent, _, _ = _build_agent(max_retries=5)
        assert agent.max_retries == 5
# ---------------------------------------------------------------------------
# Rebase
# ---------------------------------------------------------------------------
class TestRebaseOntoMain:
    """Rebase flow, including conflict detection and auto-resolution."""

    @pytest.mark.asyncio
    async def test_rebase_success(self):
        agent, _, _ = _build_agent()
        wt_repo = MagicMock()
        with patch("app_factory.agents.qa_agent.git.Repo", return_value=wt_repo):
            outcome = await agent.rebase_onto_main("/worktree/path", "task-1")
        assert outcome["success"] is True
        assert outcome["conflicts"] == []
        wt_repo.git.rebase.assert_called_once_with("main")

    @pytest.mark.asyncio
    async def test_rebase_conflict_unresolvable(self):
        agent, _, _ = _build_agent()
        wt_repo = MagicMock()
        wt_repo.git.rebase.side_effect = gitmod.GitCommandError("rebase", "CONFLICT")
        wt_repo.git.status.return_value = "UU conflicted_file.py"
        with patch("app_factory.agents.qa_agent.git.Repo", return_value=wt_repo), \
                patch.object(agent, "auto_resolve_conflicts", return_value=False):
            outcome = await agent.rebase_onto_main("/worktree/path", "task-1")
        assert outcome["success"] is False
        assert "conflicted_file.py" in outcome["conflicts"]

    @pytest.mark.asyncio
    async def test_rebase_conflict_auto_resolved(self):
        agent, _, _ = _build_agent()
        wt_repo = MagicMock()
        wt_repo.git.rebase.side_effect = gitmod.GitCommandError("rebase", "CONFLICT")
        wt_repo.git.status.return_value = "UU file.py"
        with patch("app_factory.agents.qa_agent.git.Repo", return_value=wt_repo), \
                patch.object(agent, "auto_resolve_conflicts", return_value=True):
            outcome = await agent.rebase_onto_main("/worktree/path", "task-1")
        assert outcome["success"] is True

    @pytest.mark.asyncio
    async def test_fetch_failure_continues(self):
        """If fetch fails (no remote), rebase should still be attempted."""
        agent, _, _ = _build_agent()
        wt_repo = MagicMock()
        wt_repo.git.fetch.side_effect = gitmod.GitCommandError("fetch", "No remote")
        with patch("app_factory.agents.qa_agent.git.Repo", return_value=wt_repo):
            outcome = await agent.rebase_onto_main("/worktree/path", "task-1")
        assert outcome["success"] is True
        wt_repo.git.rebase.assert_called_once_with("main")
# ---------------------------------------------------------------------------
# Linter
# ---------------------------------------------------------------------------
class TestRunLinter:
    """ruff invocation and result handling."""

    @staticmethod
    def _ruff_result(returncode, stdout):
        # Fake the CompletedProcess that subprocess.run would return.
        return subprocess.CompletedProcess(
            args=["ruff", "check", "."],
            returncode=returncode,
            stdout=stdout,
            stderr="",
        )

    def test_lint_passes(self):
        agent, _, _ = _build_agent()
        ok = self._ruff_result(0, "All checks passed!\n")
        with patch("app_factory.agents.qa_agent.subprocess.run", return_value=ok):
            outcome = agent.run_linter("/worktree/path")
        assert outcome["passed"] is True
        assert outcome["errors"] == []

    def test_lint_fails_with_errors(self):
        agent, _, _ = _build_agent()
        ruff_output = (
            "app/main.py:10:1: E501 Line too long (120 > 88 characters)\n"
            "app/main.py:15:5: F841 Local variable 'x' is assigned but never used\n"
            "Found 2 errors.\n"
        )
        failing = self._ruff_result(1, ruff_output)
        with patch("app_factory.agents.qa_agent.subprocess.run", return_value=failing):
            outcome = agent.run_linter("/worktree/path")
        assert outcome["passed"] is False
        assert len(outcome["errors"]) == 2

    def test_lint_ruff_not_found(self):
        # A missing linter is treated as non-fatal: pass with a warning.
        agent, _, _ = _build_agent()
        with patch("app_factory.agents.qa_agent.subprocess.run", side_effect=FileNotFoundError):
            outcome = agent.run_linter("/worktree/path")
        assert outcome["passed"] is True
        assert "ruff not found" in outcome["warnings"][0]

    def test_lint_timeout(self):
        agent, _, _ = _build_agent()
        expired = subprocess.TimeoutExpired(cmd="ruff", timeout=120)
        with patch("app_factory.agents.qa_agent.subprocess.run", side_effect=expired):
            outcome = agent.run_linter("/worktree/path")
        assert outcome["passed"] is False
        assert "timed out" in outcome["errors"][0]
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestRunTests:
    """pytest invocation and summary parsing."""

    @staticmethod
    def _pytest_result(returncode, stdout):
        # Fake the CompletedProcess that subprocess.run would return.
        return subprocess.CompletedProcess(
            args=["python", "-m", "pytest"],
            returncode=returncode,
            stdout=stdout,
            stderr="",
        )

    def test_all_tests_pass(self):
        agent, _, _ = _build_agent()
        output = (
            "tests/test_foo.py::test_one PASSED\n"
            "tests/test_foo.py::test_two PASSED\n"
            "========================= 2 passed =========================\n"
        )
        with patch("app_factory.agents.qa_agent.subprocess.run",
                   return_value=self._pytest_result(0, output)):
            outcome = agent.run_tests("/worktree/path")
        assert outcome["passed"] is True
        assert (outcome["total"], outcome["failures"], outcome["errors"]) == (2, 0, 0)

    def test_some_tests_fail(self):
        agent, _, _ = _build_agent()
        output = (
            "tests/test_foo.py::test_one PASSED\n"
            "tests/test_foo.py::test_two FAILED\n"
            "=================== 1 failed, 1 passed ====================\n"
        )
        with patch("app_factory.agents.qa_agent.subprocess.run",
                   return_value=self._pytest_result(1, output)):
            outcome = agent.run_tests("/worktree/path")
        assert outcome["passed"] is False
        assert outcome["total"] == 2
        assert outcome["failures"] == 1

    def test_pytest_not_found(self):
        agent, _, _ = _build_agent()
        with patch("app_factory.agents.qa_agent.subprocess.run", side_effect=FileNotFoundError):
            outcome = agent.run_tests("/worktree/path")
        assert outcome["passed"] is False
        assert "pytest not found" in outcome["output"]

    def test_pytest_timeout(self):
        agent, _, _ = _build_agent()
        expired = subprocess.TimeoutExpired(cmd="pytest", timeout=300)
        with patch("app_factory.agents.qa_agent.subprocess.run", side_effect=expired):
            outcome = agent.run_tests("/worktree/path")
        assert outcome["passed"] is False
        assert "timed out" in outcome["output"]
# ---------------------------------------------------------------------------
# Code Review
# ---------------------------------------------------------------------------
class TestCodeReview:
    """LLM-backed code review: parsing, prompting, and task context."""

    @staticmethod
    def _agent_with_review(review_text):
        # Build an agent whose SDK client returns a canned review.
        agent, _, client = _build_agent()
        client.complete = AsyncMock(
            return_value=_make_review_response(review_text)
        )
        return agent, client

    @pytest.mark.asyncio
    async def test_review_approved(self):
        agent, _ = self._agent_with_review(APPROVED_REVIEW)
        review = await agent.code_review(
            "diff content", task={"id": "1", "title": "Add feature"}
        )
        assert review["approved"] is True
        assert len(review["issues"]) == 1
        assert review["issues"][0]["severity"] == "info"
        assert review["summary"] != ""

    @pytest.mark.asyncio
    async def test_review_rejected(self):
        agent, _ = self._agent_with_review(REJECTED_REVIEW)
        review = await agent.code_review("diff with issues")
        assert review["approved"] is False
        assert len(review["issues"]) == 2
        assert review["issues"][0]["severity"] == "critical"
        assert "security" in review["summary"].lower()

    @pytest.mark.asyncio
    async def test_review_no_task_context(self):
        agent, client = self._agent_with_review(APPROVED_REVIEW)
        review = await agent.code_review("diff content", task=None)
        assert review["approved"] is True
        client.complete.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_review_loads_template(self):
        agent, client = self._agent_with_review(APPROVED_REVIEW)
        await agent.code_review("some diff")
        prompt_text = client.complete.call_args.kwargs["prompt"]
        assert "Review Checklist" in prompt_text
        assert "OWASP" in prompt_text
# ---------------------------------------------------------------------------
# Merge to main
# ---------------------------------------------------------------------------
class TestMergeToMain:
    """Merging a finished feature branch back into main."""

    def test_merge_success(self):
        agent, repo, _ = _build_agent()
        repo.head.commit.hexsha = "abc123def456"
        outcome = agent.merge_to_main("/worktree/path", "42")
        assert outcome["success"] is True
        assert outcome["commit_sha"] == "abc123def456"
        # main is checked out first, then a no-fast-forward merge is done.
        repo.git.checkout.assert_called_once_with("main")
        repo.git.merge.assert_called_once_with(
            "--no-ff", "feature/task-42", m="Merge feature/task-42"
        )

    def test_merge_failure(self):
        agent, repo, _ = _build_agent()
        repo.git.merge.side_effect = gitmod.GitCommandError("merge", "conflict")
        outcome = agent.merge_to_main("/worktree/path", "42")
        assert outcome["success"] is False
        assert outcome["commit_sha"] is None
# ---------------------------------------------------------------------------
# Full pipeline: review_and_merge
# ---------------------------------------------------------------------------
class TestReviewAndMerge:
    """End-to-end pipeline: rebase -> lint -> tests -> review -> merge.

    Each stage is patched independently so every test pins exactly one
    failure mode (or the happy path). Unused mock locals from the original
    (`mock_client` in the happy path, a never-wired `mock_wt_repo` in the
    test-failure case) have been removed.
    """

    @pytest.mark.asyncio
    async def test_happy_path(self):
        agent, mock_repo, _ = _build_agent()
        mock_repo.head.commit.hexsha = "merged123"
        mock_wt_repo = MagicMock()
        mock_wt_repo.git.diff.return_value = "diff --git a/file.py"
        with patch.object(agent, "rebase_onto_main", new_callable=AsyncMock,
                          return_value={"success": True, "conflicts": []}), \
                patch.object(agent, "run_linter",
                             return_value={"passed": True, "errors": [], "warnings": []}), \
                patch.object(agent, "run_tests",
                             return_value={"passed": True, "total": 5, "failures": 0, "errors": 0, "output": "ok"}), \
                patch.object(agent, "code_review", new_callable=AsyncMock,
                             return_value={"approved": True, "issues": [], "summary": "All good"}), \
                patch.object(agent, "merge_to_main",
                             return_value={"success": True, "commit_sha": "merged123"}), \
                patch("app_factory.agents.qa_agent.git.Repo", return_value=mock_wt_repo):
            result = await agent.review_and_merge("task-1", "/worktree/path")
        assert result["status"] == "merged"
        assert result["commit_sha"] == "merged123"

    @pytest.mark.asyncio
    async def test_rebase_failure(self):
        agent, _, _ = _build_agent()
        with patch.object(agent, "rebase_onto_main", new_callable=AsyncMock,
                          return_value={"success": False, "conflicts": ["file.py"]}):
            result = await agent.review_and_merge("task-2", "/worktree/path")
        assert result["status"] == "rebase_failed"
        assert "file.py" in result["conflicts"]
        assert result["retry_count"] == 1

    @pytest.mark.asyncio
    async def test_lint_failure(self):
        agent, _, _ = _build_agent()
        with patch.object(agent, "rebase_onto_main", new_callable=AsyncMock,
                          return_value={"success": True, "conflicts": []}), \
                patch.object(agent, "run_linter",
                             return_value={"passed": False, "errors": ["E501 line too long"], "warnings": []}):
            result = await agent.review_and_merge("task-3", "/worktree/path")
        assert result["status"] == "lint_failed"
        assert len(result["errors"]) == 1

    @pytest.mark.asyncio
    async def test_test_failure(self):
        agent, _, _ = _build_agent()
        with patch.object(agent, "rebase_onto_main", new_callable=AsyncMock,
                          return_value={"success": True, "conflicts": []}), \
                patch.object(agent, "run_linter",
                             return_value={"passed": True, "errors": [], "warnings": []}), \
                patch.object(agent, "run_tests",
                             return_value={"passed": False, "total": 3, "failures": 1, "errors": 0, "output": "FAILED"}):
            result = await agent.review_and_merge("task-4", "/worktree/path")
        assert result["status"] == "tests_failed"
        assert result["failures"] == 1

    @pytest.mark.asyncio
    async def test_review_failure(self):
        agent, _, _ = _build_agent()
        mock_wt_repo = MagicMock()
        mock_wt_repo.git.diff.return_value = "diff"
        with patch.object(agent, "rebase_onto_main", new_callable=AsyncMock,
                          return_value={"success": True, "conflicts": []}), \
                patch.object(agent, "run_linter",
                             return_value={"passed": True, "errors": [], "warnings": []}), \
                patch.object(agent, "run_tests",
                             return_value={"passed": True, "total": 3, "failures": 0, "errors": 0, "output": "ok"}), \
                patch.object(agent, "code_review", new_callable=AsyncMock,
                             return_value={"approved": False, "issues": [{"severity": "critical", "description": "vuln"}], "summary": "Bad"}), \
                patch("app_factory.agents.qa_agent.git.Repo", return_value=mock_wt_repo):
            result = await agent.review_and_merge("task-5", "/worktree/path")
        assert result["status"] == "review_failed"
        assert len(result["issues"]) == 1
# ---------------------------------------------------------------------------
# Parse test results
# ---------------------------------------------------------------------------
class TestParseTestResults:
    """Parsing of pytest summary lines into structured counts."""

    @staticmethod
    def _parse(summary):
        agent, _, _ = _build_agent()
        return agent.parse_test_results(summary)

    def test_all_passed(self):
        result = self._parse("========================= 5 passed =========================\n")
        assert result["passed"] is True
        assert (result["total"], result["failures"], result["errors"]) == (5, 0, 0)

    def test_mixed_results(self):
        result = self._parse("================ 1 failed, 4 passed, 1 error ================\n")
        assert result["passed"] is False
        assert (result["total"], result["failures"], result["errors"]) == (6, 1, 1)

    def test_all_failed(self):
        result = self._parse("========================= 3 failed =========================\n")
        assert result["passed"] is False
        assert (result["total"], result["failures"]) == (3, 3)

    def test_no_tests(self):
        result = self._parse("no tests ran\n")
        assert result["passed"] is False
        assert result["total"] == 0

    def test_errors_only(self):
        result = self._parse("========================= 2 error =========================\n")
        assert result["passed"] is False
        assert result["errors"] == 2
# ---------------------------------------------------------------------------
# Retry counter
# ---------------------------------------------------------------------------
class TestRetryCounter:
    """Per-task retry counters maintained by the QA agent."""

    def test_initial_count_zero(self):
        qa = _build_agent()[0]
        assert qa.get_retry_count("task-1") == 0

    def test_increment_and_get(self):
        qa = _build_agent()[0]
        # Each increment bumps the counter by exactly one.
        for expected in (1, 2):
            qa._increment_retry("task-1")
            assert qa.get_retry_count("task-1") == expected

    def test_separate_task_counters(self):
        qa = _build_agent()[0]
        qa._increment_retry("task-1")
        qa._increment_retry("task-1")
        qa._increment_retry("task-2")
        # Counters are tracked independently per task id.
        assert qa.get_retry_count("task-1") == 2
        assert qa.get_retry_count("task-2") == 1

    @pytest.mark.asyncio
    async def test_pipeline_failure_increments_counter(self):
        qa = _build_agent()[0]
        # Every failed pipeline run bumps the retry count for that task.
        for expected in (1, 2):
            with patch.object(qa, "rebase_onto_main", new_callable=AsyncMock,
                              return_value={"success": False, "conflicts": ["a.py"]}):
                await qa.review_and_merge("task-99", "/wt")
            assert qa.get_retry_count("task-99") == expected
# ---------------------------------------------------------------------------
# Review response parsing
# ---------------------------------------------------------------------------
class TestParseReviewResponse:
    """Parsing of the raw code-review response text."""

    @staticmethod
    def _parse(raw_response):
        # Helper: parse a raw review body through a fresh agent.
        return _build_agent()[0]._parse_review_response(raw_response)

    def test_approved_with_info(self):
        parsed = self._parse(APPROVED_REVIEW)
        assert parsed["approved"] is True
        assert len(parsed["issues"]) == 1
        assert parsed["issues"][0]["severity"] == "info"
        assert "good" in parsed["summary"].lower()

    def test_rejected_with_critical(self):
        parsed = self._parse(REJECTED_REVIEW)
        assert parsed["approved"] is False
        assert len(parsed["issues"]) == 2
        severities = [issue["severity"] for issue in parsed["issues"]]
        assert severities == ["critical", "warning"]

    def test_empty_response(self):
        # An empty body parses to a rejected, issue-free result.
        parsed = self._parse("")
        assert parsed["approved"] is False
        assert parsed["issues"] == []
        assert parsed["summary"] == ""

273
tests/test_task_agent.py Normal file
View File

@@ -0,0 +1,273 @@
"""Tests for TaskMasterAgent."""
import asyncio
import json
import subprocess
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from app_factory.agents.task_agent import TaskMasterAgent
@pytest.fixture
def agent(tmp_path):
    """TaskMasterAgent rooted at a fresh temporary directory."""
    project_root = str(tmp_path)
    return TaskMasterAgent(project_root=project_root)
def _cli_result(data, returncode=0, stderr=""):
    """Build a mock ``subprocess.CompletedProcess`` for task-master CLI calls.

    Args:
        data: Payload for stdout. dicts and lists are JSON-encoded (the CLI
            emits JSON); any other value (e.g. a raw string) is used as-is.
        returncode: Process exit code (0 means success).
        stderr: Text exposed as the process's stderr.

    Returns:
        A MagicMock specced as ``subprocess.CompletedProcess`` with
        ``returncode``, ``stdout``, and ``stderr`` populated.
    """
    result = MagicMock(spec=subprocess.CompletedProcess)
    result.returncode = returncode
    # Generalized from dict-only: JSON-encode lists too, so fixtures can
    # model CLI commands whose top-level JSON output is an array.
    result.stdout = json.dumps(data) if isinstance(data, (dict, list)) else data
    result.stderr = stderr
    return result
# --- parse_prd ---
@pytest.mark.asyncio
async def test_parse_prd_writes_file_and_calls_cli(agent, tmp_path):
    """parse_prd persists the PRD text and shells out to task-master."""
    prd_text = "# My PRD\nBuild a thing."
    cli_output = {"tasks": [{"id": 1, "title": "Task 1"}], "count": 1}

    with patch("subprocess.run", return_value=_cli_result(cli_output)) as mock_run:
        result = await agent.parse_prd(prd_text, num_tasks=5)

    # The PRD text must land where task-master expects it.
    prd_file = tmp_path / ".taskmaster" / "docs" / "prd.md"
    assert prd_file.exists()
    assert prd_file.read_text() == prd_text

    # Exactly one CLI invocation carrying the expected arguments.
    mock_run.assert_called_once()
    cmd = mock_run.call_args[0][0]
    assert cmd[0] == "task-master"
    for expected_arg in ("parse-prd", "--num-tasks", "5"):
        assert expected_arg in cmd
    assert result == cli_output
# --- get_unblocked_tasks ---
@pytest.mark.asyncio
async def test_get_unblocked_tasks_filters_correctly(agent):
    """Only pending tasks whose dependencies are all done are unblocked."""
    tasks_data = {
        "tasks": [
            {"id": 1, "title": "Done task", "status": "done", "dependencies": []},
            {"id": 2, "title": "Pending, no deps", "status": "pending", "dependencies": []},
            {"id": 3, "title": "Pending, dep done", "status": "pending", "dependencies": [1]},
            {"id": 4, "title": "Pending, dep not done", "status": "pending", "dependencies": [5]},
            {"id": 5, "title": "In-progress", "status": "in-progress", "dependencies": []},
        ]
    }
    with patch("subprocess.run", return_value=_cli_result(tasks_data)):
        unblocked = await agent.get_unblocked_tasks()

    unblocked_ids = {task["id"] for task in unblocked}
    assert 2 in unblocked_ids
    assert 3 in unblocked_ids
    assert 4 not in unblocked_ids  # dependency 5 is not done
    assert 1 not in unblocked_ids  # status is done, not pending
    assert 5 not in unblocked_ids  # status is in-progress
@pytest.mark.asyncio
async def test_get_unblocked_tasks_empty(agent):
    """An empty task list yields no unblocked tasks."""
    with patch("subprocess.run", return_value=_cli_result({"tasks": []})):
        assert await agent.get_unblocked_tasks() == []
# --- update_task_status ---
@pytest.mark.asyncio
async def test_update_task_status_without_notes(agent):
    """Without notes, only the set-status CLI call is made."""
    with patch("subprocess.run", return_value=_cli_result({})) as mock_run:
        await agent.update_task_status("3", "done")

    assert mock_run.call_count == 1
    cmd = mock_run.call_args[0][0]
    assert "--id=3" in cmd
    assert "--status=done" in cmd
@pytest.mark.asyncio
async def test_update_task_status_with_notes(agent):
    """Notes trigger a second CLI call (update-subtask) after set-status."""
    with patch("subprocess.run", return_value=_cli_result({})) as mock_run:
        await agent.update_task_status("3", "in-progress", notes="Started work")

    assert mock_run.call_count == 2
    status_cmd = mock_run.call_args_list[0][0][0]
    notes_cmd = mock_run.call_args_list[1][0][0]
    assert "set-status" in status_cmd
    assert "update-subtask" in notes_cmd
    assert "--prompt=Started work" in notes_cmd
# --- get_task_details ---
@pytest.mark.asyncio
async def test_get_task_details_returns_correct_structure(agent):
    """get_task_details passes every CLI field through unchanged."""
    task_data = {
        "task": {
            "id": 2,
            "title": "Auth system",
            "description": "Implement JWT auth",
            "details": "Use bcrypt for hashing",
            "testStrategy": "Unit tests for auth",
            "dependencies": [1],
            "subtasks": [{"id": "2.1", "title": "Setup JWT"}],
            "status": "pending",
            "priority": "high",
        }
    }
    with patch("subprocess.run", return_value=_cli_result(task_data)):
        details = await agent.get_task_details("2")

    # Scalar fields come through verbatim.
    expected_scalars = {
        "id": 2,
        "title": "Auth system",
        "description": "Implement JWT auth",
        "details": "Use bcrypt for hashing",
        "testStrategy": "Unit tests for auth",
        "status": "pending",
        "priority": "high",
    }
    for key, value in expected_scalars.items():
        assert details[key] == value
    assert details["dependencies"] == [1]
    assert len(details["subtasks"]) == 1
# --- get_next_task ---
@pytest.mark.asyncio
async def test_get_next_task_uses_cli(agent):
    """Happy path: the CLI's `next` command answers directly."""
    cli_payload = {"task": {"id": 3, "title": "Next task", "status": "pending"}}
    with patch("subprocess.run", return_value=_cli_result(cli_payload)):
        next_task = await agent.get_next_task()
    assert next_task["id"] == 3
@pytest.mark.asyncio
async def test_get_next_task_fallback_when_cli_fails(agent):
    """When `task-master next` fails, the agent falls back to scanning the
    task list and picks the highest-priority pending task.

    (Fix: removed a `call_count`/`nonlocal` counter that was incremented but
    never read — dead code in the side-effect closure.)
    """
    tasks_data = {
        "tasks": [
            {"id": 1, "title": "Done", "status": "done", "dependencies": []},
            {"id": 2, "title": "Low", "status": "pending", "dependencies": [], "priority": "low"},
            {"id": 3, "title": "High", "status": "pending", "dependencies": [], "priority": "high"},
        ]
    }
    fail_result = _cli_result("", returncode=1, stderr="error")

    def side_effect(*args, **kwargs):
        # Fail only the `next` command; the task-list command succeeds.
        cmd = args[0]
        if "next" in cmd:
            return fail_result
        return _cli_result(tasks_data)

    with patch("subprocess.run", side_effect=side_effect):
        result = await agent.get_next_task()
    assert result["id"] == 3  # high priority comes first
@pytest.mark.asyncio
async def test_get_next_task_returns_none_when_all_done(agent):
    """With no pending work, the fallback scan returns None."""
    tasks_data = {
        "tasks": [
            {"id": 1, "title": "Done", "status": "done", "dependencies": []},
        ]
    }
    fail_result = _cli_result("", returncode=1, stderr="error")

    def fake_run(*args, **kwargs):
        # `next` fails (forcing the fallback); the list shows only done tasks.
        return fail_result if "next" in args[0] else _cli_result(tasks_data)

    with patch("subprocess.run", side_effect=fake_run):
        assert await agent.get_next_task() is None
# --- expand_task ---
@pytest.mark.asyncio
async def test_expand_task(agent):
    """expand_task forwards id, subtask count, and --force to the CLI."""
    expand_data = {"subtasks": [{"id": "1.1"}, {"id": "1.2"}]}
    with patch("subprocess.run", return_value=_cli_result(expand_data)) as mock_run:
        result = await agent.expand_task("1", num_subtasks=2)

    cmd = mock_run.call_args[0][0]
    for flag in ("--id=1", "--num=2", "--force"):
        assert flag in cmd
    assert result == expand_data
# --- retry logic ---
@pytest.mark.asyncio
async def test_retry_succeeds_after_failures(agent):
    """Two transient failures followed by success still return a result."""
    agent.base_delay = 0.01  # speed up test
    outcomes = [
        _cli_result("", returncode=1, stderr="transient error"),
        _cli_result("", returncode=1, stderr="transient error"),
        _cli_result({"tasks": []}),
    ]
    with patch("subprocess.run", side_effect=outcomes):
        assert await agent.get_unblocked_tasks() == []
@pytest.mark.asyncio
async def test_retry_exhausted_raises(agent):
    """Persistent CLI failure exhausts all attempts and raises."""
    agent.base_delay = 0.01
    always_fail = _cli_result("", returncode=1, stderr="persistent error")
    with patch("subprocess.run", return_value=always_fail):
        with pytest.raises(RuntimeError, match="All 3 attempts failed"):
            await agent.get_unblocked_tasks()
@pytest.mark.asyncio
async def test_retry_exponential_backoff(agent):
    """Retries wait base_delay * 2**attempt between attempts.

    (Fix: removed `original_sleep = asyncio.sleep`, which was assigned but
    never used.)
    """
    agent.base_delay = 0.01
    fail = _cli_result("", returncode=1, stderr="error")
    success = _cli_result({"tasks": []})
    delays = []

    async def mock_sleep(duration):
        # Record the requested delay without actually sleeping.
        delays.append(duration)

    with patch("subprocess.run", side_effect=[fail, fail, success]):
        with patch("asyncio.sleep", side_effect=mock_sleep):
            await agent.get_unblocked_tasks()

    assert len(delays) == 2
    assert delays[0] == pytest.approx(0.01)  # base_delay * 2^0
    assert delays[1] == pytest.approx(0.02)  # base_delay * 2^1

269
tests/test_workspace.py Normal file
View File

@@ -0,0 +1,269 @@
"""Tests for WorkspaceManager."""
import os
from pathlib import Path
from unittest.mock import MagicMock, patch
import docker as docker_mod
import git
import pytest
from app_factory.core.workspace import (
DockerProvisionError,
GitWorktreeError,
WorkspaceError,
WorkspaceManager,
)
@pytest.fixture
def temp_git_repo(tmp_path):
    """Initialize a real git repository with one commit on a `main` branch."""
    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()
    repo = git.Repo.init(repo_dir)
    (repo_dir / "README.md").write_text("# Test Repo")
    repo.index.add(["README.md"])
    repo.index.commit("Initial commit")
    # git may default to `master`; force-rename so tests can rely on `main`.
    if repo.active_branch.name != "main":
        repo.git.branch("-M", "main")
    return str(repo_dir)
@pytest.fixture
def mock_docker_client():
    """Mock Docker client whose containers.create yields a fixed container."""
    client = MagicMock()
    client.images.pull.return_value = MagicMock()
    container = MagicMock()
    container.id = "abc123container"
    client.containers.create.return_value = container
    return client
@pytest.fixture
def workspace_manager(temp_git_repo, mock_docker_client):
    """WorkspaceManager over the real repo, with Docker patched out."""
    patch_target = "app_factory.core.workspace.docker.from_env"
    with patch(patch_target, return_value=mock_docker_client):
        return WorkspaceManager(temp_git_repo)
class TestWorkspaceManagerInit:
    """Constructor validation: repo path, docker image, Docker availability."""

    @staticmethod
    def _patched_from_env(client):
        # Helper: patch the Docker connection to return the given mock client.
        return patch("app_factory.core.workspace.docker.from_env", return_value=client)

    def test_init_valid_repo(self, temp_git_repo, mock_docker_client):
        with self._patched_from_env(mock_docker_client):
            wm = WorkspaceManager(temp_git_repo)
        assert wm.repo_path == Path(temp_git_repo).resolve()
        assert wm.docker_image == "python:3.11-slim"
        assert wm.active_workspaces == {}

    def test_init_custom_image(self, temp_git_repo, mock_docker_client):
        with self._patched_from_env(mock_docker_client):
            wm = WorkspaceManager(temp_git_repo, docker_image="node:20-slim")
        assert wm.docker_image == "node:20-slim"

    def test_init_invalid_repo(self, tmp_path, mock_docker_client):
        # A plain directory with no .git is rejected.
        with self._patched_from_env(mock_docker_client):
            with pytest.raises(GitWorktreeError, match="Invalid git repository"):
                WorkspaceManager(str(tmp_path))

    def test_init_nonexistent_path(self, mock_docker_client):
        with self._patched_from_env(mock_docker_client):
            with pytest.raises(GitWorktreeError, match="Repository path not found"):
                WorkspaceManager("/nonexistent/path/to/repo")

    def test_init_docker_unavailable(self, temp_git_repo):
        # A failing Docker connection surfaces as DockerProvisionError.
        with patch(
            "app_factory.core.workspace.docker.from_env",
            side_effect=docker_mod.errors.DockerException("Connection refused"),
        ):
            with pytest.raises(DockerProvisionError, match="Failed to connect to Docker"):
                WorkspaceManager(temp_git_repo)
@pytest.mark.asyncio
class TestCreateWorktree:
    """Worktree creation and its failure modes."""

    async def test_create_worktree_success(self, workspace_manager):
        wm = workspace_manager
        worktree_path = await wm.create_worktree("task-001")
        assert os.path.isdir(worktree_path)
        assert "task-001" in worktree_path
        branch_names = [branch.name for branch in wm.repo.branches]
        assert "feature/task-task-001" in branch_names
        # Remove the worktree so the shared repo fixture stays clean.
        wm.repo.git.worktree("remove", worktree_path, "--force")

    async def test_create_worktree_invalid_base_branch(self, workspace_manager):
        with pytest.raises(GitWorktreeError, match="does not exist"):
            await workspace_manager.create_worktree("task-002", base_branch="nonexistent")

    async def test_create_worktree_branch_already_exists(self, workspace_manager):
        # Pre-create the branch the worktree would claim.
        workspace_manager.repo.git.branch("feature/task-task-003")
        with pytest.raises(GitWorktreeError, match="Branch already exists"):
            await workspace_manager.create_worktree("task-003")

    async def test_create_worktree_path_already_exists(self, workspace_manager):
        # Pre-create the directory where the worktree would be checked out.
        target = workspace_manager.repo_path.parent / "worktrees" / "task-004"
        target.mkdir(parents=True)
        with pytest.raises(GitWorktreeError, match="Worktree path already exists"):
            await workspace_manager.create_worktree("task-004")
@pytest.mark.asyncio
class TestSpinUpCleanRoom:
    """spin_up_clean_room provisions an isolated per-task Docker container."""
    async def test_spin_up_clean_room_success(self, workspace_manager, mock_docker_client):
        wm = workspace_manager
        container = await wm.spin_up_clean_room("/tmp/fake_worktree", "task-010")
        assert container.id == "abc123container"
        # Image is pulled once, then the container is created with the
        # worktree mounted read-write at /workspace and networking disabled.
        mock_docker_client.images.pull.assert_called_once_with("python:3.11-slim")
        mock_docker_client.containers.create.assert_called_once_with(
            image="python:3.11-slim",
            name="appfactory-task-task-010",
            volumes={"/tmp/fake_worktree": {"bind": "/workspace", "mode": "rw"}},
            working_dir="/workspace",
            network_mode="none",
            auto_remove=False,
            detach=True,
            command="sleep infinity",
        )
        # The workspace registry records the new container under the task id.
        assert "task-010" in wm.active_workspaces
        info = wm.active_workspaces["task-010"]
        assert info["worktree_path"] == "/tmp/fake_worktree"
        assert info["container_id"] == "abc123container"
    async def test_spin_up_image_pull_failure(self, workspace_manager, mock_docker_client):
        # A Docker API error during the image pull is wrapped in our exception.
        wm = workspace_manager
        mock_docker_client.images.pull.side_effect = docker_mod.errors.APIError("pull failed")
        with pytest.raises(DockerProvisionError, match="Failed to pull image"):
            await wm.spin_up_clean_room("/tmp/fake", "task-011")
    async def test_spin_up_container_create_failure(self, workspace_manager, mock_docker_client):
        # Likewise for a failure at container-creation time.
        wm = workspace_manager
        mock_docker_client.containers.create.side_effect = docker_mod.errors.APIError("create failed")
        with pytest.raises(DockerProvisionError, match="Failed to create container"):
            await wm.spin_up_clean_room("/tmp/fake", "task-012")
@pytest.mark.asyncio
class TestCleanupWorkspace:
    """cleanup_workspace tears down container and worktree, then deregisters."""

    @staticmethod
    def _register(wm, task_id, worktree_path, container_id, container):
        # Helper: inject an entry directly into the workspace registry.
        wm.active_workspaces[task_id] = {
            "task_id": task_id,
            "worktree_path": worktree_path,
            "container_id": container_id,
            "container": container,
        }

    async def test_cleanup_workspace_success(self, workspace_manager):
        wm = workspace_manager
        path = await wm.create_worktree("task-020")
        container = MagicMock()
        self._register(wm, "task-020", path, "cont123", container)
        await wm.cleanup_workspace("task-020")
        container.stop.assert_called_once_with(timeout=5)
        container.remove.assert_called_once_with(force=True)
        assert "task-020" not in wm.active_workspaces
        assert not os.path.exists(path)

    async def test_cleanup_already_stopped_container(self, workspace_manager):
        wm = workspace_manager
        container = MagicMock()
        container.stop.side_effect = Exception("already stopped")
        container.remove.return_value = None
        self._register(wm, "task-021", "/tmp/nonexistent", "cont456", container)
        # A failing stop() must not propagate out of cleanup.
        await wm.cleanup_workspace("task-021")
        assert "task-021" not in wm.active_workspaces

    async def test_cleanup_no_container(self, workspace_manager):
        wm = workspace_manager
        path = await wm.create_worktree("task-022")
        self._register(wm, "task-022", path, None, None)
        # Cleanup with no container still removes the registry entry.
        await wm.cleanup_workspace("task-022")
        assert "task-022" not in wm.active_workspaces
class TestGetActiveWorkspaces:
    """get_active_workspaces lists registry entries minus container handles."""

    def test_get_empty(self, workspace_manager):
        assert workspace_manager.get_active_workspaces() == []

    def test_get_with_workspaces(self, workspace_manager):
        wm = workspace_manager
        for task_id, path, cid in (("t1", "/path/t1", "c1"), ("t2", "/path/t2", "c2")):
            wm.active_workspaces[task_id] = {
                "task_id": task_id,
                "worktree_path": path,
                "container_id": cid,
                "container": MagicMock(),
            }
        listed = wm.get_active_workspaces()
        assert len(listed) == 2
        assert {"task_id": "t1", "worktree_path": "/path/t1", "container_id": "c1"} in listed
        assert {"task_id": "t2", "worktree_path": "/path/t2", "container_id": "c2"} in listed
        # The live container handle must never leak out of the listing.
        assert all("container" not in entry for entry in listed)
@pytest.mark.asyncio
class TestCleanupAll:
    """cleanup_all tears down every registered workspace."""

    async def test_cleanup_all_success(self, workspace_manager):
        wm = workspace_manager
        first = MagicMock()
        second = MagicMock()
        wm.active_workspaces["t1"] = {
            "task_id": "t1",
            "worktree_path": "/tmp/nonexistent1",
            "container_id": "c1",
            "container": first,
        }
        wm.active_workspaces["t2"] = {
            "task_id": "t2",
            "worktree_path": "/tmp/nonexistent2",
            "container_id": "c2",
            "container": second,
        }
        await wm.cleanup_all()
        # Both containers were stopped and the registry is empty.
        first.stop.assert_called_once()
        second.stop.assert_called_once()
        assert wm.active_workspaces == {}

    async def test_cleanup_all_with_errors(self, workspace_manager):
        wm = workspace_manager
        broken = MagicMock()
        broken.stop.side_effect = Exception("stop failed")
        broken.remove.side_effect = Exception("remove failed")
        wm.active_workspaces["t1"] = {
            "task_id": "t1",
            "worktree_path": "/tmp/nonexistent",
            "container_id": "c1",
            "container": broken,
        }
        # Errors are aggregated into one WorkspaceError, but cleanup completes.
        with pytest.raises(WorkspaceError, match="Cleanup all completed with errors"):
            await wm.cleanup_all()
        assert wm.active_workspaces == {}