first commit

This commit is contained in:
2026-02-25 23:49:54 -05:00
commit 4d097161cb
1775 changed files with 452827 additions and 0 deletions

7
app_factory/__init__.py Normal file
View File

@@ -0,0 +1,7 @@
"""App Factory - Autonomous multi-agent orchestration framework."""
from app_factory.core.graph import AppFactoryOrchestrator
from app_factory.core.workspace import WorkspaceManager
from app_factory.core.observability import ObservabilityManager
__all__ = ["AppFactoryOrchestrator", "WorkspaceManager", "ObservabilityManager"]

View File

@@ -0,0 +1,8 @@
"""Agent modules for the App Factory orchestration framework."""
from app_factory.agents.pm_agent import PMAgent
from app_factory.agents.task_agent import TaskMasterAgent
from app_factory.agents.dev_agent import DevAgentManager
from app_factory.agents.qa_agent import QAAgent
__all__ = ["PMAgent", "TaskMasterAgent", "DevAgentManager", "QAAgent"]

View File

@@ -0,0 +1,205 @@
"""Dev Agent Manager - Spawns Dev Agents in Docker containers via Claude Code."""
import asyncio
import logging
import os
import re
import tempfile
from pathlib import Path
import pexpect
# Module-level logger for Dev Agent spawning/execution diagnostics.
logger = logging.getLogger(__name__)
# Template used to render each Dev Agent task prompt; lives in app_factory/prompts/.
PROMPT_TEMPLATE_PATH = Path(__file__).resolve().parent.parent / "prompts" / "dev_task_execution.txt"
class DevAgentManager:
    """Spawns Dev Agents in Docker containers, interfaces with Claude Code via pexpect."""

    def __init__(self, docker_client=None, max_retries: int = 3, timeout: int = 1800):
        """Initialize DevAgentManager.

        Args:
            docker_client: Docker client instance (or None to create from env).
            max_retries: Maximum Dev-QA bounce retries per task.
            timeout: Timeout in seconds for Claude Code execution (default 30 min).
        """
        if docker_client is not None:
            self.docker_client = docker_client
        else:
            # Imported lazily so callers that inject a client never need docker installed.
            import docker
            self.docker_client = docker.from_env()
        self.max_retries = max_retries
        self.timeout = timeout
        # Maps task_id -> number of attempts made so far (see execute_with_retry).
        self._retry_counts: dict[str, int] = {}

    def prepare_task_prompt(self, task: dict, global_arch: str = "") -> str:
        """Build a prompt string for the Dev Agent from the template.

        Args:
            task: Task dict with keys task_id, title, description, details, testStrategy.
            global_arch: Optional global architecture summary.

        Returns:
            Formatted prompt string.
        """
        template = PROMPT_TEMPLATE_PATH.read_text()
        return template.format(
            task_id=task.get("task_id", task.get("id", "")),
            title=task.get("title", ""),
            description=task.get("description", ""),
            details=task.get("details", ""),
            test_strategy=task.get("testStrategy", ""),
            global_architecture=global_arch or "No architecture context provided.",
        )

    async def execute_task(
        self,
        task: dict,
        container_id: str,
        worktree_path: str,
        global_arch: str = "",
    ) -> dict:
        """Execute a task inside a Docker container using Claude Code.

        NOTE: pexpect's spawn/expect calls are synchronous, so this blocks the
        event loop for the duration of the Claude Code run.

        Args:
            task: Task dict.
            container_id: Docker container ID to exec into.
            worktree_path: Host path to the worktree (mounted at /workspace).
            global_arch: Optional architecture context.

        Returns:
            Dict with status, output, files_changed, and exit_code.
        """
        prompt = self.prepare_task_prompt(task, global_arch)
        # Write prompt to temp file in worktree so it's visible inside the container
        prompt_file = os.path.join(worktree_path, ".task_prompt.txt")
        with open(prompt_file, "w") as f:
            f.write(prompt)
        cmd = f"docker exec {container_id} claude --print --prompt-file /workspace/.task_prompt.txt"
        try:
            child = pexpect.spawn(cmd, timeout=self.timeout, encoding="utf-8")
            child.expect(pexpect.EOF, timeout=self.timeout)
            output = child.before or ""
            child.close()
            exit_code = child.exitstatus if child.exitstatus is not None else -1
        except pexpect.TIMEOUT:
            # Best-effort kill of the hung child; it may already be gone.
            try:
                child.close(force=True)
            except Exception:
                pass
            return {
                "status": "failed",
                "output": "timeout",
                "files_changed": [],
                "exit_code": -1,
            }
        finally:
            # Clean up prompt file regardless of outcome.
            try:
                os.remove(prompt_file)
            except OSError:
                pass
        parsed = self.parse_claude_output(output)
        status = "success" if exit_code == 0 else "failed"
        return {
            "status": status,
            "output": output,
            "files_changed": parsed["files_changed"],
            "exit_code": exit_code,
        }

    def parse_claude_output(self, output: str) -> dict:
        """Parse Claude Code output to extract structured info.

        Args:
            output: Raw stdout from Claude Code.

        Returns:
            Dict with files_changed, test_results, and errors.
        """
        # Extract file paths (common patterns: Created/Modified/Updated path/to/file.py).
        # "Modif" gets its own alternation because its participles are "Modified" /
        # "Modifying" — the shared "(ed|ing)" suffix would only match "Modifiing".
        file_patterns = re.findall(
            r"(?:(?:(?:Creat|Updat|Edit|Writ)(?:ed|ing)|Modif(?:ied|ying))\s+)([^\s]+\.\w+)",
            output,
        )
        # Also catch paths that look like source files mentioned standalone
        standalone_paths = re.findall(
            r"(?:^|\s)([\w./]+\.(?:py|js|ts|yaml|yml|json|txt|md|toml|cfg))\b",
            output,
        )
        all_files = list(dict.fromkeys(file_patterns + standalone_paths))  # dedupe, preserve order
        # Extract test results
        test_results = {}
        passed_match = re.search(r"(\d+)\s+passed", output)
        failed_match = re.search(r"(\d+)\s+failed", output)
        if passed_match:
            test_results["passed"] = int(passed_match.group(1))
        if failed_match:
            test_results["failed"] = int(failed_match.group(1))
        # Extract error messages
        errors = re.findall(r"(?:Error|Exception|FAILED)[:\s]+(.*?)(?:\n|$)", output, re.IGNORECASE)
        return {
            "files_changed": all_files,
            "test_results": test_results,
            "errors": errors,
        }

    async def execute_with_retry(
        self,
        task: dict,
        container_id: str,
        worktree_path: str,
        global_arch: str = "",
    ) -> dict:
        """Execute a task with retry logic.

        Retries up to max_retries times on failure. If all retries are exhausted
        (including the degenerate max_retries <= 0 case, where no attempt is made),
        returns a result with status 'needs_clarification'.

        Args:
            task: Task dict.
            container_id: Docker container ID.
            worktree_path: Host worktree path.
            global_arch: Optional architecture context.

        Returns:
            Final execution result dict.
        """
        task_id = str(task.get("task_id", task.get("id", "")))
        # Initialize so the fall-through below is safe even when max_retries <= 0
        # (previously this raised NameError on `result`).
        result: dict = {}
        for attempt in range(self.max_retries):
            self._retry_counts[task_id] = attempt + 1
            result = await self.execute_task(task, container_id, worktree_path, global_arch)
            if result["status"] == "success":
                return result
        # All retries exhausted
        return {
            "status": "needs_clarification",
            "output": result.get("output", ""),
            "files_changed": result.get("files_changed", []),
            "exit_code": result.get("exit_code", -1),
        }

    def get_retry_count(self, task_id: str) -> int:
        """Return current retry count for a task."""
        return self._retry_counts.get(task_id, 0)

    def reset_retry_count(self, task_id: str):
        """Reset retry counter for a task (after clarification resolved)."""
        self._retry_counts.pop(task_id, None)

View File

@@ -0,0 +1,136 @@
"""Project Manager Agent - Expands user prompts into structured PRDs and handles clarification requests."""
import os
from datetime import datetime, timezone
from pathlib import Path
from app_factory.core.claude_client import ClaudeSDKClient
class PMAgent:
    """Agent responsible for PRD generation, clarification handling, and project planning."""

    def __init__(
        self,
        api_key: str = None,
        auth_token: str = None,
        model: str = "claude-opus-4-6",
        debug: bool = False,
        observability=None,
    ):
        """Initialize the PM Agent.

        Args:
            api_key: Anthropic API key; falls back to ANTHROPIC_API_KEY env var.
            auth_token: Auth token; falls back to ANTHROPIC_AUTH_TOKEN env var.
            model: Model identifier used for all PM completions.
            debug: Enable SDK debug logging.
            observability: Optional observability manager for token accounting.
        """
        self.model = model
        # Cumulative token counters across all completions (see get_token_usage).
        self.input_tokens = 0
        self.output_tokens = 0
        self._prompts_dir = Path(__file__).resolve().parent.parent / "prompts"
        self.observability = observability
        resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        resolved_auth = auth_token or os.environ.get("ANTHROPIC_AUTH_TOKEN")
        self.client = ClaudeSDKClient(
            api_key=resolved_key,
            auth_token=resolved_auth,
            enable_debug=debug,
        )

    def _load_template(self, template_name: str) -> str:
        """Load a prompt template file from app_factory/prompts/."""
        path = self._prompts_dir / template_name
        return path.read_text()

    def _record_usage(self, task_id: str, response) -> None:
        """Accumulate token counts from a completion and log them to observability.

        Extracted so expand_prompt_to_prd and handle_clarification_request share
        one accounting path instead of duplicating it.

        Args:
            task_id: Label for the observability log entry.
            response: Completion response exposing input_tokens/output_tokens.
        """
        self.input_tokens += response.input_tokens
        self.output_tokens += response.output_tokens
        if self.observability:
            self.observability.log_token_usage(
                "pm_agent",
                task_id,
                input_tokens=response.input_tokens,
                output_tokens=response.output_tokens,
                model=self.model,
            )

    async def expand_prompt_to_prd(self, user_input: str) -> str:
        """Expand a user prompt into a structured PRD using Claude.

        Returns markdown with sections: Objective, Core Requirements,
        Technical Architecture, Tech Stack, Success Criteria, Non-Functional Requirements.
        """
        system_prompt = self._load_template("pm_prd_expansion.txt")
        response = await self.client.complete(
            prompt=user_input,
            model=self.model,
            system_prompt=system_prompt,
            max_turns=100,
            observability=self.observability,
            agent_name="pm_agent",
            task_id="expand_prd",
        )
        self._record_usage("expand_prd", response)
        return response.text

    async def handle_clarification_request(self, clarification: dict) -> str:
        """Handle a clarification request from a downstream agent.

        Args:
            clarification: dict with keys requesting_agent, task_id, question, context.

        Returns:
            Clarification response string. If the question requires human input,
            prompts the user and returns their answer.
        """
        template = self._load_template("pm_clarification.txt")
        prompt = template.format(
            requesting_agent=clarification.get("requesting_agent", "unknown"),
            task_id=clarification.get("task_id", "N/A"),
            question=clarification.get("question", ""),
            context=clarification.get("context", ""),
        )
        response = await self.client.complete(
            prompt=prompt,
            model=self.model,
            max_turns=100,
            observability=self.observability,
            agent_name="pm_agent",
            task_id=f"clarification:{clarification.get('task_id', 'N/A')}",
        )
        self._record_usage(f"clarification:{clarification.get('task_id', 'N/A')}", response)
        answer = response.text.strip()
        # Sentinel emitted by the model when it cannot answer autonomously.
        # NOTE: input() blocks the event loop; acceptable here because human
        # escalation is inherently interactive.
        if "ESCALATE_TO_HUMAN" in answer:
            human_answer = input(
                f"[PMAgent] Clarification needed for {clarification.get('requesting_agent', 'agent')} "
                f"(task {clarification.get('task_id', 'N/A')}): "
                f"{clarification.get('question', '')}\n> "
            )
            return human_answer
        return answer

    def update_prd(self, prd_path: str, updates: str):
        """Append updates to an existing PRD file with a versioned header."""
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        header = f"\n\n---\n## PRD Update - {timestamp}\n\n"
        with open(prd_path, "a") as f:
            f.write(header)
            f.write(updates)

    def get_token_usage(self) -> dict:
        """Return cumulative token usage."""
        return {
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "total_tokens": self.input_tokens + self.output_tokens,
        }

View File

@@ -0,0 +1,383 @@
"""QA Agent - Handles code review, testing, linting, and merge operations."""
import os
import re
import subprocess
from pathlib import Path
import git
from app_factory.core.claude_client import ClaudeSDKClient
class QAAgent:
    """Reviews code, runs tests, handles merge conflicts, merges worktrees to main."""

    def __init__(
        self,
        repo_path: str,
        api_key: str = None,
        auth_token: str = None,
        max_retries: int = 3,
        debug: bool = False,
        observability=None,
    ):
        """Initialize QAAgent.

        Args:
            repo_path: Path to the git repository.
            api_key: Optional API key. Falls back to ANTHROPIC_API_KEY env var.
            auth_token: Optional auth token. Falls back to ANTHROPIC_AUTH_TOKEN env var.
            max_retries: Maximum QA-Dev bounce retries per task.
            debug: Enable SDK debug logging.
            observability: Optional observability manager for token accounting.
        """
        self.repo = git.Repo(repo_path)
        self.repo_path = Path(repo_path).resolve()
        self.max_retries = max_retries
        # task_id -> number of failed QA passes so far.
        self._retry_counts: dict[str, int] = {}
        self._prompts_dir = Path(__file__).resolve().parent.parent / "prompts"
        self.observability = observability
        resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        resolved_auth = auth_token or os.environ.get("ANTHROPIC_AUTH_TOKEN")
        self.client = ClaudeSDKClient(
            api_key=resolved_key,
            auth_token=resolved_auth,
            enable_debug=debug,
        )

    async def review_and_merge(self, task_id: str, worktree_path: str, task: dict = None) -> dict:
        """Full QA pipeline: rebase, lint, test, review, merge.

        Returns:
            dict with status and details. Status is one of:
            'merged', 'rebase_failed', 'lint_failed', 'tests_failed',
            'review_failed', 'merge_failed'.
        """
        # 1. Rebase feature branch onto main
        rebase_result = await self.rebase_onto_main(worktree_path, task_id)
        if not rebase_result["success"]:
            self._increment_retry(task_id)
            return {
                "status": "rebase_failed",
                "conflicts": rebase_result.get("conflicts", []),
                "retry_count": self.get_retry_count(task_id),
            }
        # 2. Run linting
        lint_result = self.run_linter(worktree_path)
        if not lint_result["passed"]:
            self._increment_retry(task_id)
            return {
                "status": "lint_failed",
                "errors": lint_result["errors"],
                "warnings": lint_result["warnings"],
                "retry_count": self.get_retry_count(task_id),
            }
        # 3. Run tests
        test_result = self.run_tests(worktree_path)
        if not test_result["passed"]:
            self._increment_retry(task_id)
            return {
                "status": "tests_failed",
                "total": test_result["total"],
                "failures": test_result["failures"],
                "errors": test_result["errors"],
                "output": test_result["output"],
                "retry_count": self.get_retry_count(task_id),
            }
        # 4. Code review via Claude
        wt_repo = git.Repo(worktree_path)
        diff = wt_repo.git.diff("main", "--", ".")
        review_result = await self.code_review(diff, task=task)
        if not review_result["approved"]:
            self._increment_retry(task_id)
            return {
                "status": "review_failed",
                "issues": review_result["issues"],
                "summary": review_result["summary"],
                "retry_count": self.get_retry_count(task_id),
            }
        # 5. Merge to main
        merge_result = self.merge_to_main(worktree_path, task_id)
        if not merge_result["success"]:
            return {
                "status": "merge_failed",
                "error": merge_result.get("error", "Unknown merge error"),
            }
        return {
            "status": "merged",
            "commit_sha": merge_result["commit_sha"],
            "review_summary": review_result["summary"],
        }

    async def rebase_onto_main(self, worktree_path: str, task_id: str) -> dict:
        """Rebase the feature branch in the worktree onto main.

        Returns:
            dict with success bool and conflicts list.
        """
        wt_repo = git.Repo(worktree_path)
        try:
            wt_repo.git.fetch("origin", "main")
        except git.GitCommandError:
            pass  # fetch may fail in local-only repos; continue with local main
        try:
            wt_repo.git.rebase("main")
            return {"success": True, "conflicts": []}
        except git.GitCommandError:
            # Rebase failed — check for conflicts
            conflicts = self._get_conflict_files(wt_repo)
            if conflicts and self.auto_resolve_conflicts(worktree_path):
                return {"success": True, "conflicts": []}
            # Abort the failed rebase so the worktree is left in a clean state
            try:
                wt_repo.git.rebase("--abort")
            except git.GitCommandError:
                pass
            return {"success": False, "conflicts": conflicts}

    def run_linter(self, worktree_path: str) -> dict:
        """Run ruff linter on the worktree.

        Returns:
            dict with passed bool, errors list, and warnings list.
        """
        try:
            result = subprocess.run(
                ["ruff", "check", "."],
                cwd=worktree_path,
                capture_output=True,
                text=True,
                timeout=120,
            )
        except FileNotFoundError:
            # Linting is best-effort: a missing ruff binary should not block QA.
            return {"passed": True, "errors": [], "warnings": ["ruff not found, skipping lint"]}
        except subprocess.TimeoutExpired:
            return {"passed": False, "errors": ["Linter timed out"], "warnings": []}
        errors = []
        warnings = []
        for line in result.stdout.splitlines():
            line = line.strip()
            if not line or line.startswith("Found") or line.startswith("All checks"):
                continue
            # ruff output lines contain error codes like E501, W291, etc.
            if re.search(r"\b[A-Z]\d{3,4}\b", line):
                errors.append(line)
            elif line:
                warnings.append(line)
        passed = result.returncode == 0
        return {"passed": passed, "errors": errors, "warnings": warnings}

    def run_tests(self, worktree_path: str) -> dict:
        """Run pytest in the worktree.

        Returns:
            dict with passed bool, total/failures/errors counts, and raw output.
        """
        try:
            result = subprocess.run(
                ["python", "-m", "pytest", "-v", "--tb=short"],
                cwd=worktree_path,
                capture_output=True,
                text=True,
                timeout=300,
            )
        except FileNotFoundError:
            return {"passed": False, "total": 0, "failures": 0, "errors": 1,
                    "output": "pytest not found"}
        except subprocess.TimeoutExpired:
            return {"passed": False, "total": 0, "failures": 0, "errors": 1,
                    "output": "Test execution timed out"}
        output = result.stdout + result.stderr
        parsed = self.parse_test_results(output)
        parsed["output"] = output
        return parsed

    async def code_review(self, diff: str, task: dict = None) -> dict:
        """Review a diff using Claude for quality and security issues.

        Returns:
            dict with approved bool, issues list, and summary string.
        """
        template = self._load_template("qa_review.txt")
        task_context = ""
        if task:
            task_context = (
                f"Task ID: {task.get('id', 'N/A')}\n"
                f"Title: {task.get('title', 'N/A')}\n"
                f"Description: {task.get('description', 'N/A')}"
            )
        prompt = template.format(task_context=task_context, diff=diff)
        response = await self.client.complete(
            prompt=prompt,
            model="claude-sonnet-4-6",
            max_turns=100,
            observability=self.observability,
            agent_name="qa_agent",
            task_id=str(task.get("id", task.get("task_id", "review"))) if task else "review",
        )
        if self.observability:
            self.observability.log_token_usage(
                "qa_agent",
                str(task.get("id", task.get("task_id", "review"))) if task else "review",
                input_tokens=response.input_tokens,
                output_tokens=response.output_tokens,
                model="claude-sonnet-4-6",
            )
        text = response.text
        return self._parse_review_response(text)

    def merge_to_main(self, worktree_path: str, task_id: str) -> dict:
        """Merge the feature branch into main with --no-ff.

        Returns:
            dict with success bool and commit_sha.
        """
        branch_name = f"feature/task-{task_id}"
        try:
            self.repo.git.checkout("main")
            self.repo.git.merge("--no-ff", branch_name, m=f"Merge {branch_name}")
            commit_sha = self.repo.head.commit.hexsha
            return {"success": True, "commit_sha": commit_sha}
        except git.GitCommandError as e:
            return {"success": False, "commit_sha": None, "error": str(e)}

    def auto_resolve_conflicts(self, worktree_path: str) -> bool:
        """Try to auto-resolve simple merge conflicts.

        Returns True if all conflicts were resolved.
        """
        wt_repo = git.Repo(worktree_path)
        unmerged = wt_repo.index.unmerged_blobs()
        if not unmerged:
            return True
        for path in unmerged:
            file_path = os.path.join(worktree_path, path)
            if not os.path.exists(file_path):
                continue
            try:
                with open(file_path) as f:
                    content = f.read()
                # Accept "theirs" (incoming) for simple conflicts
                if "<<<<<<< " in content and "=======" in content and ">>>>>>> " in content:
                    resolved = re.sub(
                        r"<<<<<<< [^\n]*\n.*?=======\n(.*?)>>>>>>> [^\n]*\n",
                        r"\1",
                        content,
                        flags=re.DOTALL,
                    )
                    with open(file_path, "w") as f:
                        f.write(resolved)
                    wt_repo.index.add([path])
                else:
                    return False
            except Exception:
                return False
        try:
            wt_repo.git.rebase("--continue")
            return True
        except git.GitCommandError:
            return False

    def parse_test_results(self, output: str) -> dict:
        """Parse pytest output into structured results.

        pytest emits several "=== ... ===" banner lines (the "test session
        starts" header first, the count summary like "5 passed, 2 failed" last).
        We scan ALL banners and keep the last one carrying counts — using a
        single re.search would match the header and always report zero tests.

        Returns:
            dict with passed bool, total int, failures int, errors int.
        """
        passed_count = 0
        failed_count = 0
        error_count = 0
        for banner in re.findall(r"=+\s*(.*?)\s*=+\s*$", output, re.MULTILINE):
            p = re.search(r"(\d+)\s+passed", banner)
            f = re.search(r"(\d+)\s+failed", banner)
            e = re.search(r"(\d+)\s+error", banner)
            if p or f or e:
                # Later banners win: the final summary line is authoritative.
                passed_count = int(p.group(1)) if p else 0
                failed_count = int(f.group(1)) if f else 0
                error_count = int(e.group(1)) if e else 0
        total = passed_count + failed_count + error_count
        # total > 0 guards against declaring success when no tests ran at all.
        all_passed = failed_count == 0 and error_count == 0 and total > 0
        return {
            "passed": all_passed,
            "total": total,
            "failures": failed_count,
            "errors": error_count,
        }

    def get_retry_count(self, task_id: str) -> int:
        """Return QA retry count for a task."""
        return self._retry_counts.get(task_id, 0)

    def _increment_retry(self, task_id: str):
        """Increment the retry counter for a task."""
        self._retry_counts[task_id] = self._retry_counts.get(task_id, 0) + 1

    def _load_template(self, template_name: str) -> str:
        """Load a prompt template file from app_factory/prompts/."""
        path = self._prompts_dir / template_name
        return path.read_text()

    def _get_conflict_files(self, repo: git.Repo) -> list[str]:
        """Get list of conflicting files from a repo."""
        try:
            status_output = repo.git.status("--porcelain")
            conflicts = []
            for line in status_output.splitlines():
                # UU = both modified, AA = both added (porcelain conflict markers).
                if line.startswith("UU ") or line.startswith("AA "):
                    conflicts.append(line[3:].strip())
            return conflicts
        except git.GitCommandError:
            return []

    def _parse_review_response(self, text: str) -> dict:
        """Parse Claude's review response into structured data.

        Expected line formats: "APPROVED: true|false",
        "- [severity: critical|warning|info] description", "SUMMARY: ...".
        """
        approved = False
        issues = []
        summary = ""
        for line in text.splitlines():
            line = line.strip()
            if line.upper().startswith("APPROVED:"):
                value = line.split(":", 1)[1].strip().lower()
                approved = value in ("true", "yes")
            elif line.startswith("- ["):
                # Parse issue lines like "- [severity: critical] description"
                issue_match = re.match(
                    r"-\s*\[severity:\s*(critical|warning|info)\]\s*(.*)",
                    line,
                    re.IGNORECASE,
                )
                if issue_match:
                    issues.append({
                        "severity": issue_match.group(1).lower(),
                        "description": issue_match.group(2).strip(),
                    })
            elif line.upper().startswith("SUMMARY:"):
                summary = line.split(":", 1)[1].strip()
        return {"approved": approved, "issues": issues, "summary": summary}

View File

@@ -0,0 +1,180 @@
"""Task Master Agent - Bridge to claude-task-master for task graph management."""
import asyncio
import json
import logging
import os
import subprocess
from pathlib import Path
# Module-level logger for task-master bridge diagnostics (retries, CLI calls).
logger = logging.getLogger(__name__)
class TaskMasterAgent:
"""Bridge to claude-task-master for task graph management and dependency resolution."""
def __init__(self, project_root: str, mcp_client=None):
self.project_root = str(project_root)
self.mcp_client = mcp_client
self.max_retries = 3
self.base_delay = 1.0
async def parse_prd(self, prd_content: str, num_tasks: int = 10) -> dict:
"""Write PRD content to disk and invoke task-master parse-prd."""
docs_dir = Path(self.project_root) / ".taskmaster" / "docs"
docs_dir.mkdir(parents=True, exist_ok=True)
prd_path = docs_dir / "prd.md"
prd_path.write_text(prd_content)
result = await self._call_with_retry(
self._run_cli,
"parse-prd",
str(prd_path),
"--num-tasks",
str(num_tasks),
"--force",
)
return result
async def get_unblocked_tasks(self) -> list:
"""Get all pending tasks whose dependencies are all done."""
result = await self._call_with_retry(self._run_cli, "list", "--json")
tasks = result.get("tasks", [])
done_ids = {
str(t["id"]) for t in tasks if t.get("status") == "done"
}
unblocked = []
for task in tasks:
if task.get("status") != "pending":
continue
deps = [str(d) for d in task.get("dependencies", [])]
if all(d in done_ids for d in deps):
unblocked.append(task)
return unblocked
async def update_task_status(
self, task_id: str, status: str, notes: str = ""
):
"""Update a task's status and optionally add implementation notes."""
await self._call_with_retry(
self._run_cli,
"set-status",
f"--id={task_id}",
f"--status={status}",
)
if notes:
await self._call_with_retry(
self._run_cli,
"update-subtask",
f"--id={task_id}",
f"--prompt={notes}",
)
async def get_task_details(self, task_id: str) -> dict:
"""Get full details for a specific task."""
result = await self._call_with_retry(
self._run_cli, "show", str(task_id), "--json"
)
task = result.get("task", result)
return {
"id": task.get("id"),
"title": task.get("title", ""),
"description": task.get("description", ""),
"details": task.get("details", ""),
"testStrategy": task.get("testStrategy", ""),
"dependencies": task.get("dependencies", []),
"subtasks": task.get("subtasks", []),
"status": task.get("status", "pending"),
"priority": task.get("priority", ""),
}
async def get_next_task(self) -> dict | None:
"""Get the highest-priority unblocked task, or None."""
try:
result = await self._call_with_retry(
self._run_cli, "next", "--json"
)
task = result.get("task", result)
if task and task.get("id"):
return task
except RuntimeError:
logger.debug("next_task command failed, falling back to manual selection")
unblocked = await self.get_unblocked_tasks()
if not unblocked:
return None
priority_order = {"high": 0, "medium": 1, "low": 2}
unblocked.sort(
key=lambda t: (
priority_order.get(t.get("priority", "medium"), 1),
t.get("id", 0),
)
)
return unblocked[0]
async def expand_task(self, task_id: str, num_subtasks: int = 5) -> dict:
"""Break a task into subtasks."""
result = await self._call_with_retry(
self._run_cli,
"expand",
f"--id={task_id}",
f"--num={num_subtasks}",
"--force",
)
return result
async def _call_with_retry(self, func, *args, **kwargs):
"""Retry with exponential backoff."""
last_exc = None
for attempt in range(self.max_retries):
try:
return await func(*args, **kwargs)
except Exception as exc:
last_exc = exc
if attempt < self.max_retries - 1:
delay = self.base_delay * (2 ** attempt)
logger.warning(
"Attempt %d/%d failed: %s. Retrying in %.1fs",
attempt + 1,
self.max_retries,
exc,
delay,
)
await asyncio.sleep(delay)
raise RuntimeError(
f"All {self.max_retries} attempts failed. Last error: {last_exc}"
) from last_exc
async def _run_cli(self, *args: str) -> dict:
"""Execute a task-master CLI command and return parsed JSON output."""
cmd = ["task-master", *args]
logger.debug("Running CLI: %s", " ".join(cmd))
proc = await asyncio.get_event_loop().run_in_executor(
None,
lambda: subprocess.run(
cmd,
capture_output=True,
text=True,
cwd=self.project_root,
timeout=120,
),
)
if proc.returncode != 0:
raise RuntimeError(
f"task-master {args[0]} failed (rc={proc.returncode}): {proc.stderr.strip()}"
)
stdout = proc.stdout.strip()
if not stdout:
return {}
try:
return json.loads(stdout)
except json.JSONDecodeError:
return {"raw_output": stdout}

View File

@@ -0,0 +1,8 @@
"""Core modules for the App Factory orchestration framework."""
from app_factory.core.graph import AppFactoryOrchestrator, AppFactoryState
from app_factory.core.workspace import WorkspaceManager
from app_factory.core.observability import ObservabilityManager
from app_factory.core.architecture_tracker import ArchitectureTracker
__all__ = ["AppFactoryOrchestrator", "AppFactoryState", "WorkspaceManager", "ObservabilityManager", "ArchitectureTracker"]

View File

@@ -0,0 +1,300 @@
"""Architecture Tracker - Tracks global architecture to prevent context starvation and code duplication."""
import ast
import json
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from app_factory.core.claude_client import ClaudeSDKClient
# Module-level logger for architecture-tracking diagnostics (load/extract failures).
logger = logging.getLogger(__name__)
class ArchitectureTracker:
"""Tracks global architecture to prevent Dev Agent context starvation and code duplication."""
def __init__(
self,
data_dir: str = "app_factory/data",
api_key: str = None,
auth_token: str = None,
debug: bool = False,
observability=None,
):
"""Initialize ArchitectureTracker.
Args:
data_dir: Directory for storing global_architecture.json.
api_key: Optional API key for AI-powered summarization.
"""
self.data_dir = Path(data_dir)
self.data_dir.mkdir(parents=True, exist_ok=True)
self._arch_path = self.data_dir / "global_architecture.json"
self.observability = observability
self._client = None
resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
resolved_auth = auth_token or os.environ.get("ANTHROPIC_AUTH_TOKEN")
if resolved_key or resolved_auth:
try:
self._client = ClaudeSDKClient(
api_key=resolved_key,
auth_token=resolved_auth,
enable_debug=debug,
)
except Exception as exc:
logger.warning("Claude SDK unavailable (%s). AI summarization disabled.", exc)
self._architecture = self.load_architecture()
def _default_architecture(self) -> dict:
"""Return default architecture schema."""
return {
"modules": [],
"utilities": [],
"design_patterns": [],
"naming_conventions": {
"variables": "snake_case",
"classes": "PascalCase",
"functions": "snake_case",
"constants": "UPPER_SNAKE_CASE",
},
"tech_stack": {
"language": "Python",
"framework": "LangGraph",
},
"version": 1,
"last_updated": datetime.now(timezone.utc).isoformat(),
}
def load_architecture(self) -> dict:
"""Load from global_architecture.json or return default."""
if self._arch_path.exists():
try:
with open(self._arch_path, "r") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as exc:
logger.warning("Failed to load architecture file (%s). Using default.", exc)
return self._default_architecture()
def save_architecture(self, data: dict):
"""Save to global_architecture.json with timestamp update."""
data["last_updated"] = datetime.now(timezone.utc).isoformat()
with open(self._arch_path, "w") as f:
json.dump(data, f, indent=2)
self._architecture = data
async def update_architecture(self, completed_task: dict, files_changed: list):
"""Update architecture based on completed task and changed files.
Args:
completed_task: Dict with task info (e.g. title, description).
files_changed: List of file paths that were modified.
"""
new_modules = []
new_utilities = []
for file_path in files_changed:
if not os.path.exists(file_path) or not file_path.endswith(".py"):
continue
try:
with open(file_path, "r") as f:
source = f.read()
except OSError:
continue
if self._client:
await self._ai_extract(source, file_path, new_modules, new_utilities)
else:
self._basic_extract(source, file_path, new_modules, new_utilities)
existing_module_names = {m["name"] for m in self._architecture["modules"]}
for mod in new_modules:
if mod["name"] not in existing_module_names:
self._architecture["modules"].append(mod)
existing_module_names.add(mod["name"])
existing_utility_names = {u["name"] for u in self._architecture["utilities"]}
for util in new_utilities:
if util["name"] not in existing_utility_names:
self._architecture["utilities"].append(util)
existing_utility_names.add(util["name"])
self.save_architecture(self._architecture)
async def _ai_extract(
self, source: str, file_path: str, modules: list, utilities: list
):
"""Use Claude to extract architecture info from source code."""
prompt = (
"Analyze this Python source file and extract:\n"
"1. Module-level classes (name, purpose)\n"
"2. Utility functions (name, description)\n"
"Respond ONLY with valid JSON: "
'{"classes": [{"name": "...", "purpose": "..."}], '
'"functions": [{"name": "...", "description": "..."}]}\n\n'
f"File: {file_path}\n```python\n{source[:4000]}\n```"
)
try:
response = await self._client.complete(
prompt=prompt,
model="claude-sonnet-4-6",
max_turns=100,
observability=self.observability,
agent_name="architecture_tracker",
task_id=f"ai_extract:{Path(file_path).name}",
)
if self.observability:
self.observability.log_token_usage(
"architecture_tracker",
f"ai_extract:{Path(file_path).name}",
input_tokens=response.input_tokens,
output_tokens=response.output_tokens,
model="claude-sonnet-4-6",
)
text = response.text
# Extract JSON from response
start = text.find("{")
end = text.rfind("}") + 1
if start >= 0 and end > start:
data = json.loads(text[start:end])
for cls in data.get("classes", []):
modules.append({
"name": cls["name"],
"purpose": cls.get("purpose", ""),
"file_path": file_path,
})
for func in data.get("functions", []):
utilities.append({
"name": func["name"],
"description": func.get("description", ""),
"file_path": file_path,
})
except Exception as exc:
logger.warning("AI extraction failed (%s). Falling back to basic.", exc)
self._basic_extract(source, file_path, modules, utilities)
def _basic_extract(
self, source: str, file_path: str, modules: list, utilities: list
):
"""Extract architecture info using AST parsing."""
try:
tree = ast.parse(source)
except SyntaxError:
return
for node in ast.iter_child_nodes(tree):
if isinstance(node, ast.ClassDef):
docstring = ast.get_docstring(node) or ""
modules.append({
"name": node.name,
"purpose": docstring.split("\n")[0] if docstring else "",
"file_path": file_path,
})
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
if node.name.startswith("_"):
continue
docstring = ast.get_docstring(node) or ""
utilities.append({
"name": node.name,
"description": docstring.split("\n")[0] if docstring else "",
"file_path": file_path,
})
def get_architecture_summary(self, max_tokens: int = 2000) -> str:
    """Generate concise text summary from architecture data.

    Args:
        max_tokens: Approximate max tokens for the summary (~4 chars per token).

    Returns:
        Formatted string for injection into Dev Agent prompts.
    """
    char_budget = max_tokens * 4
    arch = self._architecture
    lines = ["## Project Architecture Summary", ""]

    stack = arch.get("tech_stack", {})
    if stack:
        lines.append("### Tech Stack")
        lines.extend(f"- {key}: {value}" for key, value in stack.items())
        lines.append("")

    module_entries = arch.get("modules", [])
    if module_entries:
        lines.append("### Modules")
        lines.extend(
            f"- **{entry['name']}** ({entry.get('file_path', '')}): {entry.get('purpose', '')}"
            for entry in module_entries
        )
        lines.append("")

    utility_entries = arch.get("utilities", [])
    if utility_entries:
        lines.append("### Shared Utilities")
        lines.extend(
            f"- **{entry['name']}** ({entry.get('file_path', '')}): {entry.get('description', '')}"
            for entry in utility_entries
        )
        lines.append("")

    pattern_entries = arch.get("design_patterns", [])
    if pattern_entries:
        lines.append("### Design Patterns")
        lines.extend(
            f"- {entry.get('pattern', '')}: {entry.get('usage', '')}"
            for entry in pattern_entries
        )
        lines.append("")

    conventions = arch.get("naming_conventions", {})
    if conventions:
        lines.append("### Naming Conventions")
        lines.extend(f"- {key}: {value}" for key, value in conventions.items())
        lines.append("")

    text = "\n".join(lines)
    # Hard-clip to the character budget with an ellipsis marker.
    if len(text) > char_budget:
        return text[:char_budget - 3] + "..."
    return text
def add_module(self, name: str, purpose: str, file_path: str):
    """Manually add a module to the architecture.

    Args:
        name: Module/class name.
        purpose: Brief description of what it does.
        file_path: Path to the source file.
    """
    entry = {"name": name, "purpose": purpose, "file_path": file_path}
    self._architecture["modules"].append(entry)
    # Persist immediately so manual additions survive the process.
    self.save_architecture(self._architecture)
def add_utility(self, name: str, description: str, file_path: str):
    """Manually add a utility function to the architecture.

    Args:
        name: Function name.
        description: Brief description of what it does.
        file_path: Path to the source file.
    """
    entry = {"name": name, "description": description, "file_path": file_path}
    self._architecture["utilities"].append(entry)
    # Persist immediately so manual additions survive the process.
    self.save_architecture(self._architecture)

View File

@@ -0,0 +1,721 @@
"""Shared Claude Agent SDK client wrapper."""
from __future__ import annotations
import asyncio
import logging
import os
import shutil
import tempfile
from dataclasses import dataclass
from importlib import import_module
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
@dataclass
class ClaudeCompletion:
    """Normalized completion result from Claude Agent SDK."""

    # Final text: the SDK result message, or concatenated assistant text blocks.
    text: str
    # Token counts extracted best-effort from the SDK usage payload; 0 when unknown.
    input_tokens: int = 0
    output_tokens: int = 0
    # Raw usage dict exactly as reported by the SDK, for downstream inspection.
    raw_usage: dict[str, Any] | None = None
def _load_sdk() -> tuple[Any, Any]:
"""Load Claude Agent SDK symbols."""
try:
mod = import_module("claude_agent_sdk")
return mod.query, mod.ClaudeAgentOptions
except Exception as exc:
raise ImportError(
"Claude Agent SDK is not installed. Install 'claude-agent-sdk'."
) from exc
class ClaudeSDKClient:
    """Small adapter over Claude Agent SDK query() streaming API."""

    # Cumulative time marks (seconds from the first failure) that pace
    # rate-limit retries; the per-attempt delay is the difference between
    # consecutive marks.
    _RATE_LIMIT_RETRY_TIME_MARKS_SECONDS: tuple[float, ...] = (0.2, 1.0, 5.0)
    # Substrings that mark a payload key as sensitive; matching values are
    # replaced with "[REDACTED]" before being logged.
    _SENSITIVE_KEY_TOKENS: tuple[str, ...] = (
        "api_key",
        "apikey",
        "auth",
        "token",
        "secret",
        "password",
        "authorization",
        "cookie",
    )

    def __init__(
        self,
        api_key: str | None = None,
        auth_token: str | None = None,
        enable_debug: bool = False,
    ):
        """Initialize the client.

        Args:
            api_key: Optional Anthropic API key, exported as ANTHROPIC_API_KEY.
            auth_token: Optional auth token, exported as ANTHROPIC_AUTH_TOKEN.
            enable_debug: When True, capture SDK debug stderr per attempt.
        """
        # SDK symbols are resolved lazily on first use (_ensure_sdk_loaded).
        self._query: Any | None = None
        self._options_cls: Any | None = None
        # Environment overrides merged into every SDK invocation.
        self._env: dict[str, str] = {}
        self._enable_debug = enable_debug
        if api_key:
            self._env["ANTHROPIC_API_KEY"] = api_key
        if auth_token:
            self._env["ANTHROPIC_AUTH_TOKEN"] = auth_token
async def complete(
    self,
    prompt: str,
    *,
    model: str | None = None,
    system_prompt: str | None = None,
    max_turns: int = 100,
    cwd: str | None = None,
    env: dict[str, str] | None = None,
    observability: Any | None = None,
    agent_name: str = "claude_sdk",
    task_id: str = "-",
) -> ClaudeCompletion:
    """Run a single-turn completion and normalize text/token usage.

    Streams SDK messages, collecting assistant text blocks and the final
    result message, with rate-limit retries paced by
    _RATE_LIMIT_RETRY_TIME_MARKS_SECONDS.

    Args:
        prompt: Prompt passed to the SDK query.
        model: Optional model override.
        system_prompt: Optional system prompt.
        max_turns: Maximum agent turns for the query.
        cwd: Working directory for the SDK execution.
        env: Extra environment merged over the client defaults.
        observability: Optional sink with a log_claude_event method.
        agent_name: Label attached to observability events.
        task_id: Task identifier attached to observability events.

    Returns:
        ClaudeCompletion with normalized text and best-effort token counts.

    Raises:
        RuntimeError: On SDK failure, error result subtype, or empty
            response once retries are exhausted.
    """
    self._ensure_sdk_loaded()
    self._emit_observability_event(
        observability,
        agent_name,
        task_id,
        "request_start",
        {
            "model": model,
            "max_turns": max_turns,
            "cwd": cwd,
            "prompt_chars": len(prompt),
            "system_prompt_chars": len(system_prompt) if system_prompt else 0,
        },
    )
    options_kwargs: dict[str, Any] = {"max_turns": max_turns}
    if model:
        options_kwargs["model"] = model
    if system_prompt:
        options_kwargs["system_prompt"] = system_prompt
    if cwd:
        options_kwargs["cwd"] = cwd
    effective_env = dict(self._env)
    if env:
        effective_env.update(env)
    # May redirect HOME to a project-local fallback when ~/.claude is read-only.
    effective_env = self._ensure_claude_home_writable(effective_env, cwd=cwd)
    if effective_env:
        options_kwargs["env"] = effective_env
    total_attempts = len(self._RATE_LIMIT_RETRY_TIME_MARKS_SECONDS) + 1
    for attempt in range(total_attempts):
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "attempt_start",
            {"attempt": attempt + 1, "total_attempts": total_attempts},
        )
        debug_stderr = None
        if self._enable_debug:
            # Fresh capture file per attempt; closed in the finally below.
            debug_stderr = tempfile.TemporaryFile(mode="w+t", encoding="utf-8")
        attempt_options_kwargs = dict(options_kwargs)
        if debug_stderr is not None:
            attempt_options_kwargs["debug_stderr"] = debug_stderr
            attempt_options_kwargs["extra_args"] = {"debug-to-stderr": None}
        options = self._options_cls(**attempt_options_kwargs)
        assistant_parts: list[str] = []
        result_text: str | None = None
        usage: dict[str, Any] | None = None
        error_text: str | None = None
        result_subtype: str | None = None
        session_id: str | None = None
        stderr_detail = ""
        try:
            async for msg in self._query(prompt=prompt, options=options):
                session_id = self._record_stream_message(
                    msg=msg,
                    observability=observability,
                    agent_name=agent_name,
                    task_id=task_id,
                    current_session_id=session_id,
                )
                content = getattr(msg, "content", None)
                # Only assistant messages contain model output content.
                if content and hasattr(msg, "model"):
                    for block in content:
                        text = getattr(block, "text", None)
                        if text:
                            assistant_parts.append(text)
                msg_result = getattr(msg, "result", None)
                if isinstance(msg_result, str) and msg_result.strip():
                    result_text = msg_result
                msg_subtype = getattr(msg, "subtype", None)
                if isinstance(msg_subtype, str):
                    result_subtype = msg_subtype
                msg_usage = getattr(msg, "usage", None)
                if isinstance(msg_usage, dict):
                    usage = msg_usage
                if getattr(msg, "is_error", False):
                    error_text = msg_result if isinstance(msg_result, str) else "Claude SDK error"
            stderr_detail = self._combine_stderr_details(self._read_debug_stderr(debug_stderr))
        except Exception as exc:
            stderr_detail = self._combine_stderr_details(
                self._read_debug_stderr(debug_stderr),
                self._extract_exception_stderr(exc),
            )
            error_message = self._format_error(
                f"Claude SDK query failed: {exc}",
                stderr_detail,
                add_hint=True,
            )
            should_retry = await self._should_retry_rate_limit_error(error_message, attempt)
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "request_error",
                {
                    "attempt": attempt + 1,
                    "error": self._truncate_text(str(exc)),
                    "retrying": should_retry,
                    "stderr": self._truncate_text(stderr_detail),
                },
            )
            if should_retry:
                continue
            raise RuntimeError(error_message) from exc
        finally:
            if debug_stderr is not None:
                debug_stderr.close()
        if error_text:
            # SDK flagged an error message during streaming.
            error_message = self._format_error(error_text, stderr_detail, add_hint=True)
            should_retry = await self._should_retry_rate_limit_error(error_message, attempt)
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "request_error",
                {
                    "attempt": attempt + 1,
                    "error": self._truncate_text(error_text),
                    "retrying": should_retry,
                    "stderr": self._truncate_text(stderr_detail),
                },
            )
            if should_retry:
                continue
            raise RuntimeError(error_message)
        if result_subtype and "error" in result_subtype.lower():
            # Result subtype such as "error_max_turns" — treat as a failure.
            error_message = self._format_error(
                f"Claude SDK execution ended with subtype '{result_subtype}'.",
                stderr_detail,
                add_hint=True,
            )
            should_retry = await self._should_retry_rate_limit_error(error_message, attempt)
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "request_error",
                {
                    "attempt": attempt + 1,
                    "error": f"result subtype={result_subtype}",
                    "retrying": should_retry,
                    "stderr": self._truncate_text(stderr_detail),
                },
            )
            if should_retry:
                continue
            raise RuntimeError(error_message)
        # Prefer the final result message; fall back to streamed assistant text.
        text = (result_text or "\n".join(assistant_parts)).strip()
        if not text:
            error_message = self._format_error(
                "Claude SDK returned empty response",
                stderr_detail,
                add_hint=True,
            )
            should_retry = await self._should_retry_rate_limit_error(error_message, attempt)
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "request_error",
                {
                    "attempt": attempt + 1,
                    "error": "empty response",
                    "retrying": should_retry,
                    "stderr": self._truncate_text(stderr_detail),
                },
            )
            if should_retry:
                continue
            raise RuntimeError(error_message)
        input_tokens, output_tokens = self._extract_token_counts(usage)
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "request_complete",
            {
                "attempt": attempt + 1,
                "session_id": session_id,
                "result_subtype": result_subtype,
                "result_preview": self._truncate_text(text, max_chars=180),
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "usage": self._sanitize_payload(usage),
            },
        )
        return ClaudeCompletion(
            text=text,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            raw_usage=usage,
        )
    # Unreachable: every loop path either returns, raises, or continues,
    # and the final attempt never continues.
    raise RuntimeError("Claude SDK retry loop exhausted unexpectedly")
def _ensure_sdk_loaded(self) -> None:
    """Resolve SDK symbols on first use; no-op once both are cached."""
    if self._query is None or self._options_cls is None:
        self._query, self._options_cls = _load_sdk()
def _extract_token_counts(self, usage: dict[str, Any] | None) -> tuple[int, int]:
"""Best-effort token extraction across SDK usage payload variants."""
if not isinstance(usage, dict):
return 0, 0
input_tokens = self._to_int(
usage.get("input_tokens") or usage.get("inputTokens")
)
output_tokens = self._to_int(
usage.get("output_tokens") or usage.get("outputTokens")
)
if input_tokens == 0:
input_tokens = sum(
self._to_int(v)
for k, v in usage.items()
if "input" in k.lower() and "output" not in k.lower()
)
if output_tokens == 0:
output_tokens = sum(
self._to_int(v) for k, v in usage.items() if "output" in k.lower()
)
return input_tokens, output_tokens
def _ensure_claude_home_writable(
    self,
    env: dict[str, str],
    *,
    cwd: str | None = None,
) -> dict[str, str]:
    """Fallback to a project-local HOME when ~/.claude paths are not writable.

    Probes the HOME from *env* (or the process default); when the Claude
    config paths there cannot be written, prepares a fallback HOME under
    <cwd>/.app_factory/claude_home and points HOME at it.

    Args:
        env: Environment for the SDK invocation (not mutated).
        cwd: Base directory for the fallback; defaults to os.getcwd().

    Returns:
        A copy of *env*, possibly with HOME overridden.
    """
    effective = dict(env)
    current_home = Path(effective.get("HOME") or str(Path.home())).expanduser()
    if self._claude_home_is_writable(current_home):
        return effective
    fallback_root = Path(cwd or os.getcwd()) / ".app_factory" / "claude_home"
    fallback_home = self._prepare_fallback_claude_home(
        source_home=current_home,
        fallback_home=fallback_root,
    )
    effective["HOME"] = str(fallback_home)
    logger.warning(
        "Claude home '%s' is not writable; using fallback HOME at '%s'.",
        current_home,
        fallback_home,
    )
    return effective
@staticmethod
def _claude_home_is_writable(home: Path) -> bool:
claude_dir = home / ".claude"
required_dirs = [claude_dir, claude_dir / "todos", claude_dir / "debug"]
config_file = home / ".claude.json"
try:
for directory in required_dirs:
directory.mkdir(parents=True, exist_ok=True)
probe = directory / ".app_factory_write_probe"
probe.write_text("ok", encoding="utf-8")
probe.unlink()
config_file.touch(exist_ok=True)
with open(config_file, "a", encoding="utf-8"):
pass
except OSError:
return False
return True
@staticmethod
def _prepare_fallback_claude_home(source_home: Path, fallback_home: Path) -> Path:
    """Create the fallback HOME layout and best-effort copy existing Claude config.

    Builds .claude/todos and .claude/debug under *fallback_home*, then copies
    .claude.json and the .claude directory from *source_home* when readable.
    Copy failures are swallowed so an unreadable original never blocks the
    fallback.

    Returns:
        The prepared *fallback_home* path.
    """
    fallback_home.mkdir(parents=True, exist_ok=True)
    fallback_claude_dir = fallback_home / ".claude"
    (fallback_claude_dir / "todos").mkdir(parents=True, exist_ok=True)
    (fallback_claude_dir / "debug").mkdir(parents=True, exist_ok=True)
    source_claude_dir = source_home / ".claude"
    source_config = source_home / ".claude.json"
    target_config = fallback_home / ".claude.json"
    if source_config.exists() and source_config.is_file() and os.access(source_config, os.R_OK):
        try:
            shutil.copy2(source_config, target_config)
        except OSError:
            # Best-effort: keep whatever state the fallback already has.
            pass
    else:
        # No readable source config — start from an empty one.
        target_config.touch(exist_ok=True)
    if source_claude_dir.exists() and source_claude_dir.is_dir() and os.access(
        source_claude_dir, os.R_OK
    ):
        try:
            shutil.copytree(source_claude_dir, fallback_claude_dir, dirs_exist_ok=True)
        except OSError:
            pass
    return fallback_home
@staticmethod
def _to_int(value: Any) -> int:
try:
return int(value)
except (TypeError, ValueError):
return 0
@staticmethod
def _read_debug_stderr(debug_stderr: Any) -> str:
if debug_stderr is None:
return ""
try:
debug_stderr.flush()
debug_stderr.seek(0)
value = debug_stderr.read()
if isinstance(value, str):
return value.strip()
except Exception:
pass
return ""
@staticmethod
def _extract_exception_stderr(exc: Exception) -> str:
stderr = getattr(exc, "stderr", None)
return stderr.strip() if isinstance(stderr, str) else ""
@staticmethod
def _combine_stderr_details(*details: str) -> str:
merged: list[str] = []
seen: set[str] = set()
for detail in details:
value = detail.strip() if isinstance(detail, str) else ""
if not value or value in seen:
continue
seen.add(value)
merged.append(value)
if not merged:
return ""
placeholder = "Check stderr output for details"
non_placeholder = [detail for detail in merged if placeholder not in detail]
preferred = non_placeholder if non_placeholder else merged
return "\n\n".join(preferred)
async def _should_retry_rate_limit_error(self, error_message: str, attempt: int) -> bool:
if attempt >= len(self._RATE_LIMIT_RETRY_TIME_MARKS_SECONDS):
return False
text = error_message.lower()
retryable_tokens = (
"rate_limit_event",
"rate limit",
"rate-limited",
"too many requests",
"status code: 429",
"status code 429",
)
if not any(token in text for token in retryable_tokens):
return False
time_marks = self._RATE_LIMIT_RETRY_TIME_MARKS_SECONDS
target_mark = time_marks[attempt]
previous_mark = time_marks[attempt - 1] if attempt > 0 else 0.0
delay = max(target_mark - previous_mark, 0.0)
logger.warning(
"Claude SDK rate limit/transient event detected (attempt %d/%d). "
"Retrying in %.1fs (target %.1fs from first failure).",
attempt + 1,
len(self._RATE_LIMIT_RETRY_TIME_MARKS_SECONDS) + 1,
delay,
target_mark,
)
await asyncio.sleep(delay)
return True
@staticmethod
def _format_error(message: str, stderr_detail: str, add_hint: bool = False) -> str:
hint = ""
if add_hint:
hint = (
"\nHint: verify Claude auth is available (ANTHROPIC_API_KEY or "
"ANTHROPIC_AUTH_TOKEN, or a valid Claude Code OAuth session) and that the "
"process can write ~/.claude and ~/.claude.json."
)
if stderr_detail:
return f"{message}\nSDK stderr:\n{stderr_detail}{hint}"
return f"{message}{hint}"
def _record_stream_message(
    self,
    *,
    msg: Any,
    observability: Any | None,
    agent_name: str,
    task_id: str,
    current_session_id: str | None,
) -> str | None:
    """Emit an observability event describing one streamed SDK message.

    Message kinds are distinguished by duck typing: a stream event carries
    `event` + `uuid`; assistant/user messages carry `content` (assistant
    also has `model`); result messages carry a `subtype` + `duration_ms`;
    system messages carry a `subtype` + `data`.

    Returns:
        The session id seen so far (the message's own, or the carried-over
        *current_session_id*).
    """
    session_id = getattr(msg, "session_id", None) or current_session_id
    parent_tool_use_id = getattr(msg, "parent_tool_use_id", None)
    stream_event = getattr(msg, "event", None)
    if stream_event is not None and hasattr(msg, "uuid"):
        stream_event_type = None
        if isinstance(stream_event, dict):
            stream_event_type = stream_event.get("type") or stream_event.get("event")
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "stream_event",
            {
                "session_id": session_id,
                "stream_event_type": stream_event_type,
                "parent_tool_use_id": parent_tool_use_id,
            },
        )
        # Stream events carry nothing else worth recording.
        return session_id
    content = getattr(msg, "content", None)
    if content:
        # Only assistant messages expose a `model` attribute.
        is_assistant_message = hasattr(msg, "model")
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "assistant_message" if is_assistant_message else "user_message",
            {
                "session_id": session_id,
                "parent_tool_use_id": parent_tool_use_id,
                "model": getattr(msg, "model", None),
                "content_block_count": len(content) if isinstance(content, list) else 1,
            },
        )
        if isinstance(content, list):
            for block in content:
                self._record_content_block(
                    block=block,
                    observability=observability,
                    agent_name=agent_name,
                    task_id=task_id,
                    session_id=session_id,
                    parent_tool_use_id=parent_tool_use_id,
                )
    subtype = getattr(msg, "subtype", None)
    if isinstance(subtype, str):
        if hasattr(msg, "duration_ms"):
            # Final result message: record timing/cost/usage metadata.
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "result_message",
                {
                    "session_id": session_id,
                    "subtype": subtype,
                    "is_error": bool(getattr(msg, "is_error", False)),
                    "num_turns": getattr(msg, "num_turns", None),
                    "duration_ms": getattr(msg, "duration_ms", None),
                    "duration_api_ms": getattr(msg, "duration_api_ms", None),
                    "total_cost_usd": getattr(msg, "total_cost_usd", None),
                    "usage": self._sanitize_payload(getattr(msg, "usage", None)),
                },
            )
        elif hasattr(msg, "data"):
            self._emit_observability_event(
                observability,
                agent_name,
                task_id,
                "system_message",
                {
                    "session_id": session_id,
                    "subtype": subtype,
                    "data": self._sanitize_payload(getattr(msg, "data", None)),
                },
            )
    return session_id
def _record_content_block(
    self,
    *,
    block: Any,
    observability: Any | None,
    agent_name: str,
    task_id: str,
    session_id: str | None,
    parent_tool_use_id: str | None,
) -> None:
    """Emit an observability event for one content block of a message.

    Block kind is detected by attribute shape, first match wins:
    tool use (name + input + id), tool result (tool_use_id), text block
    (text), thinking block (thinking). Unrecognized blocks are ignored.
    """
    block_name = getattr(block, "name", None)
    block_input = getattr(block, "input", None)
    block_id = getattr(block, "id", None)
    if block_name is not None and block_input is not None and block_id is not None:
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "tool_use",
            {
                "session_id": session_id,
                "tool_use_id": block_id,
                "parent_tool_use_id": parent_tool_use_id,
                "tool_name": str(block_name),
                "tool_input": self._sanitize_payload(block_input),
            },
        )
        return
    tool_use_id = getattr(block, "tool_use_id", None)
    if tool_use_id is not None:
        content = getattr(block, "content", None)
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "tool_result",
            {
                "session_id": session_id,
                "tool_use_id": tool_use_id,
                "parent_tool_use_id": parent_tool_use_id,
                "is_error": bool(getattr(block, "is_error", False)),
                "content": self._sanitize_payload(content),
            },
        )
        return
    text = getattr(block, "text", None)
    if isinstance(text, str) and text:
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "text_block",
            {
                "session_id": session_id,
                "chars": len(text),
                "preview": self._truncate_text(text),
            },
        )
        return
    thinking = getattr(block, "thinking", None)
    if isinstance(thinking, str) and thinking:
        # Thinking content itself is not logged — only its size.
        self._emit_observability_event(
            observability,
            agent_name,
            task_id,
            "thinking_block",
            {
                "session_id": session_id,
                "chars": len(thinking),
            },
        )
def _emit_observability_event(
self,
observability: Any | None,
agent_name: str,
task_id: str,
event_type: str,
payload: dict[str, Any] | None = None,
) -> None:
if observability is None:
return
log_method = getattr(observability, "log_claude_event", None)
if not callable(log_method):
return
try:
log_method(
agent_name=agent_name,
task_id=task_id,
event_type=event_type,
payload=self._sanitize_payload(payload),
)
except Exception:
# Observability should never break execution.
logger.debug("Failed to emit observability event", exc_info=True)
@classmethod
def _is_sensitive_key(cls, key: Any) -> bool:
if not isinstance(key, str):
return False
lowered = key.lower()
return any(token in lowered for token in cls._SENSITIVE_KEY_TOKENS)
@classmethod
def _sanitize_payload(cls, value: Any, *, _depth: int = 0) -> Any:
if _depth >= 4:
return "[truncated]"
if isinstance(value, dict):
sanitized: dict[str, Any] = {}
for idx, (k, v) in enumerate(value.items()):
if idx >= 40:
sanitized["__truncated_items__"] = len(value) - 40
break
key = str(k)
if cls._is_sensitive_key(key):
sanitized[key] = "[REDACTED]"
else:
sanitized[key] = cls._sanitize_payload(v, _depth=_depth + 1)
return sanitized
if isinstance(value, (list, tuple)):
items = [cls._sanitize_payload(v, _depth=_depth + 1) for v in value[:40]]
if len(value) > 40:
items.append(f"...({len(value) - 40} more)")
return items
if isinstance(value, str):
return cls._truncate_text(value)
if isinstance(value, (int, float, bool)) or value is None:
return value
return cls._truncate_text(str(value))
@staticmethod
def _truncate_text(value: str, max_chars: int = 400) -> str:
if not isinstance(value, str):
return ""
trimmed = value.strip()
if len(trimmed) <= max_chars:
return trimmed
return f"{trimmed[:max_chars]}...({len(trimmed) - max_chars} more chars)"

444
app_factory/core/graph.py Normal file
View File

@@ -0,0 +1,444 @@
"""Graph Orchestrator - LangGraph-based multi-agent workflow orchestration."""
import asyncio
import json
import logging
import os
from typing import TypedDict
from langgraph.graph import END, START, StateGraph
logger = logging.getLogger(__name__)
class AppFactoryState(TypedDict):
    """Global state passed through the orchestration graph."""

    user_input: str  # Raw user prompt given to run()
    prd: str  # Product requirements document produced by the PM agent
    tasks: list  # All tasks from task-master
    active_tasks: dict  # task_id -> {status, container_id, worktree_path}
    completed_tasks: list  # List of completed task_ids
    blocked_tasks: dict  # task_id -> reason
    clarification_requests: list  # Pending clarification dicts
    global_architecture: str  # Architecture summary for dev agents
    iteration_count: int  # Safety counter to prevent infinite loops
    max_iterations: int  # Max loop iterations (default 50)
    errors: list  # Error log
class AppFactoryOrchestrator:
    """Main LangGraph state machine for the App Factory."""

    def __init__(
        self,
        pm_agent=None,
        task_agent=None,
        dev_manager=None,
        qa_agent=None,
        workspace_manager=None,
        observability=None,
    ):
        """Store collaborating agents; leaving any as None enables mock/no-op paths.

        Args:
            pm_agent: Expands prompts to PRDs and answers clarifications.
            task_agent: Parses PRDs into tasks and tracks task status.
            dev_manager: Executes dev tasks (with retry) in containers.
            qa_agent: Reviews and merges completed work.
            workspace_manager: Creates worktrees/containers and cleans up.
            observability: Optional logger for state transitions.
        """
        self.pm_agent = pm_agent
        self.task_agent = task_agent
        self.dev_manager = dev_manager
        self.qa_agent = qa_agent
        self.workspace_manager = workspace_manager
        self.observability = observability
def build_graph(self) -> StateGraph:
    """Build and compile the LangGraph StateGraph with nodes and edges.

    Flow: START -> pm_node -> task_node -> dev_dispatch_node -> qa_node,
    with qa_node looping back to task_node, and clarification_node handling
    blocked/questioned tasks before re-entering task_node.

    Returns:
        The compiled graph, ready for ainvoke().
    """
    graph = StateGraph(AppFactoryState)
    graph.add_node("pm_node", self._pm_node)
    graph.add_node("task_node", self._task_node)
    graph.add_node("dev_dispatch_node", self._dev_dispatch_node)
    graph.add_node("qa_node", self._qa_node)
    graph.add_node("clarification_node", self._clarification_node)
    graph.add_edge(START, "pm_node")
    graph.add_conditional_edges(
        "pm_node",
        self._should_continue_after_pm,
        {
            "task_node": "task_node",
            "end": END,
        },
    )
    graph.add_conditional_edges(
        "task_node",
        self._should_continue_after_tasks,
        {
            "dev_dispatch": "dev_dispatch_node",
            "end": END,
            "clarification": "clarification_node",
        },
    )
    # Dev output always flows into QA.
    graph.add_edge("dev_dispatch_node", "qa_node")
    graph.add_conditional_edges(
        "qa_node",
        self._should_continue_after_qa,
        {
            "task_node": "task_node",
            "clarification": "clarification_node",
            "end": END,
        },
    )
    # Clarifications re-enter scheduling.
    graph.add_edge("clarification_node", "task_node")
    return graph.compile()
def _should_continue_after_pm(self, state: dict) -> str:
"""Routing function after pm_node: 'task_node' | 'end'."""
prd = state.get("prd", "")
if prd and prd.strip():
return "task_node"
# PM failure (or empty prompt) yields no PRD and should terminate cleanly.
return "end"
def _should_continue_after_tasks(self, state: dict) -> str:
"""Routing function after task_node: 'dev_dispatch' | 'end' | 'clarification'."""
if state.get("iteration_count", 0) >= state.get("max_iterations", 50):
return "end"
tasks = state.get("tasks", [])
completed = set(state.get("completed_tasks", []))
all_task_ids = {str(t.get("id", "")) for t in tasks}
# Check if all tasks are done
if all_task_ids and all_task_ids <= completed:
return "end"
# Check for unblocked tasks (pending tasks with all deps done)
unblocked = []
for t in tasks:
if str(t.get("id", "")) in completed:
continue
if t.get("status") == "done":
continue
deps = [str(d) for d in t.get("dependencies", [])]
if all(d in completed for d in deps):
unblocked.append(t)
if unblocked:
return "dev_dispatch"
# No unblocked tasks - if there are blocked ones, try clarification
if state.get("blocked_tasks") or state.get("clarification_requests"):
return "clarification"
# No tasks at all or nothing left to do
return "end"
def _should_continue_after_qa(self, state: dict) -> str:
"""Routing function after qa_node: 'task_node' | 'clarification' | 'end'."""
if state.get("iteration_count", 0) >= state.get("max_iterations", 50):
return "end"
if state.get("clarification_requests"):
return "clarification"
# Loop back to check for newly unblocked tasks
return "task_node"
async def _pm_node(self, state: dict) -> dict:
"""Call PM agent to expand user input into a PRD."""
if self.observability:
self.observability.log_state_transition("start", "pm_node")
user_input = state.get("user_input", "")
if not user_input:
return {"prd": "", "errors": state.get("errors", []) + ["No user input provided"]}
if self.pm_agent is None:
return {"prd": f"Mock PRD for: {user_input}"}
try:
prd = await self.pm_agent.expand_prompt_to_prd(user_input)
return {"prd": prd}
except Exception as e:
logger.error("PM agent failed: %s", e)
return {"prd": "", "errors": state.get("errors", []) + [f"PM agent error: {e}"]}
async def _task_node(self, state: dict) -> dict:
"""Parse PRD into tasks or get unblocked tasks. Increments iteration_count."""
if self.observability:
self.observability.log_state_transition("pm_node/qa_node/clarification_node", "task_node")
iteration_count = state.get("iteration_count", 0) + 1
updates = {"iteration_count": iteration_count}
if iteration_count >= state.get("max_iterations", 50):
updates["errors"] = state.get("errors", []) + ["Max iterations reached"]
return updates
if self.task_agent is None:
return updates
try:
existing_tasks = state.get("tasks", [])
if not existing_tasks:
# First pass - parse the PRD
prd = state.get("prd", "")
if prd:
await self.task_agent.parse_prd(prd)
unblocked = await self.task_agent.get_unblocked_tasks()
updates["tasks"] = unblocked
else:
# Subsequent passes - refresh unblocked tasks
unblocked = await self.task_agent.get_unblocked_tasks()
updates["tasks"] = unblocked
except Exception as e:
logger.error("Task agent failed: %s", e)
updates["errors"] = state.get("errors", []) + [f"Task agent error: {e}"]
return updates
async def _dev_dispatch_node(self, state: dict) -> dict:
    """Dispatch dev agents concurrently for unblocked tasks.

    Selects tasks whose dependencies are all completed and that are not
    already active, then runs each in its own worktree + container via
    the dev manager. With no dev/workspace manager configured, tasks are
    marked successful immediately (mock path for testing).

    Returns:
        Partial state update with active_tasks, completed_tasks, errors,
        and clarification_requests (empty dict when nothing to run).
    """
    if self.observability:
        self.observability.log_state_transition("task_node", "dev_dispatch_node")
    tasks = state.get("tasks", [])
    completed = set(state.get("completed_tasks", []))
    active_tasks = dict(state.get("active_tasks", {}))
    errors = list(state.get("errors", []))
    clarification_requests = list(state.get("clarification_requests", []))
    global_arch = state.get("global_architecture", "")
    # Filter to unblocked, not-yet-completed tasks
    to_execute = []
    for t in tasks:
        tid = str(t.get("id", ""))
        if tid in completed or tid in active_tasks:
            continue
        deps = [str(d) for d in t.get("dependencies", [])]
        if all(d in completed for d in deps):
            to_execute.append(t)
    if not to_execute:
        return {}
    if self.dev_manager is None or self.workspace_manager is None:
        # Mock execution for testing
        new_completed = list(completed)
        for t in to_execute:
            tid = str(t.get("id", ""))
            active_tasks[tid] = {"status": "success", "container_id": "mock", "worktree_path": "/mock"}
            new_completed.append(tid)
        return {"active_tasks": active_tasks, "completed_tasks": new_completed}

    async def _execute_single(task):
        # Run one task end-to-end; never raises — failures become a
        # {"status": "failed", ...} result so gather() stays clean.
        tid = str(task.get("id", ""))
        worktree_path = None
        container = None
        try:
            worktree_path = await self.workspace_manager.create_worktree(tid)
            container = await self.workspace_manager.spin_up_clean_room(worktree_path, tid)
            container_id = container.id
            if self.task_agent:
                await self.task_agent.update_task_status(tid, "in-progress")
            result = await self.dev_manager.execute_with_retry(
                task, container_id, worktree_path, global_arch
            )
            return tid, result, worktree_path
        except Exception as e:
            logger.error("Dev dispatch failed for task %s: %s", tid, e)
            return tid, {"status": "failed", "output": str(e), "files_changed": [], "exit_code": -1}, worktree_path

    # Execute concurrently
    results = await asyncio.gather(*[_execute_single(t) for t in to_execute], return_exceptions=True)
    new_completed = list(completed)
    for item in results:
        if isinstance(item, Exception):
            # Defensive: _execute_single catches internally, but
            # return_exceptions=True guards against anything unexpected.
            errors.append(f"Dev dispatch exception: {item}")
            continue
        tid, result, worktree_path = item
        status = result.get("status", "failed")
        active_tasks[tid] = {
            "status": status,
            "container_id": result.get("container_id", ""),
            "worktree_path": worktree_path or "",
        }
        if status == "success":
            new_completed.append(tid)
        elif status == "needs_clarification":
            # Dev agent exhausted retries — escalate to the PM.
            clarification_requests.append({
                "requesting_agent": "dev_agent",
                "task_id": tid,
                "question": f"Task {tid} failed after retries. Output: {result.get('output', '')[:500]}",
                "context": result.get("output", "")[:1000],
            })
    return {
        "active_tasks": active_tasks,
        "completed_tasks": new_completed,
        "errors": errors,
        "clarification_requests": clarification_requests,
    }
async def _qa_node(self, state: dict) -> dict:
    """Run QA on completed dev tasks.

    For each dev-successful task, asks the QA agent to review and merge
    its worktree. Merged tasks are marked done; failures either escalate
    to a clarification request (retry budget exhausted) or move the task
    back to blocked (and out of completed) for another attempt. The
    workspace is cleaned up after each QA pass.

    Returns:
        Partial state update ({} when there is nothing to review or no
        QA agent is configured).
    """
    if self.observability:
        self.observability.log_state_transition("dev_dispatch_node", "qa_node")
    active_tasks = dict(state.get("active_tasks", {}))
    completed = list(state.get("completed_tasks", []))
    errors = list(state.get("errors", []))
    clarification_requests = list(state.get("clarification_requests", []))
    blocked_tasks = dict(state.get("blocked_tasks", {}))
    # Find tasks that were successfully completed by dev and need QA
    tasks_for_qa = []
    for tid, info in active_tasks.items():
        if info.get("status") == "success" and tid in completed:
            tasks_for_qa.append((tid, info))
    if not tasks_for_qa or self.qa_agent is None:
        return {}
    for tid, info in tasks_for_qa:
        worktree_path = info.get("worktree_path", "")
        if not worktree_path:
            continue
        try:
            # Find the task dict for context
            task_dict = None
            for t in state.get("tasks", []):
                if str(t.get("id", "")) == tid:
                    task_dict = t
                    break
            qa_result = await self.qa_agent.review_and_merge(tid, worktree_path, task=task_dict)
            qa_status = qa_result.get("status", "")
            if qa_status == "merged":
                # Successfully merged - update task status
                if self.task_agent:
                    await self.task_agent.update_task_status(tid, "done")
                active_tasks[tid]["status"] = "merged"
            else:
                # QA failed - may need clarification or retry
                retry_count = qa_result.get("retry_count", 0)
                if retry_count >= (self.qa_agent.max_retries if self.qa_agent else 3):
                    # Out of retries — escalate to the PM for a decision.
                    clarification_requests.append({
                        "requesting_agent": "qa_agent",
                        "task_id": tid,
                        "question": f"QA failed for task {tid} with status '{qa_status}'",
                        "context": str(qa_result),
                    })
                else:
                    blocked_tasks[tid] = f"QA {qa_status}: {qa_result}"
                    # Remove from completed so it can be retried
                    if tid in completed:
                        completed.remove(tid)
                active_tasks[tid]["status"] = qa_status
            # Cleanup workspace after QA
            if self.workspace_manager:
                try:
                    await self.workspace_manager.cleanup_workspace(tid)
                except Exception as e:
                    # Cleanup is best-effort; a leak is logged, not fatal.
                    logger.warning("Workspace cleanup failed for task %s: %s", tid, e)
        except Exception as e:
            logger.error("QA failed for task %s: %s", tid, e)
            errors.append(f"QA error for task {tid}: {e}")
    return {
        "active_tasks": active_tasks,
        "completed_tasks": completed,
        "errors": errors,
        "clarification_requests": clarification_requests,
        "blocked_tasks": blocked_tasks,
    }
async def _clarification_node(self, state: dict) -> dict:
"""Handle clarification requests via PM agent."""
if self.observability:
self.observability.log_state_transition("task_node/qa_node", "clarification_node")
requests = list(state.get("clarification_requests", []))
blocked_tasks = dict(state.get("blocked_tasks", {}))
errors = list(state.get("errors", []))
if not requests:
return {"clarification_requests": []}
if self.pm_agent is None:
# Clear requests without processing for testing
return {"clarification_requests": [], "blocked_tasks": {}}
resolved = []
remaining = []
for req in requests:
try:
answer = await self.pm_agent.handle_clarification_request(req)
tid = req.get("task_id", "")
if tid and tid in blocked_tasks:
del blocked_tasks[tid]
resolved.append({"request": req, "answer": answer})
except Exception as e:
logger.error("Clarification failed: %s", e)
errors.append(f"Clarification error: {e}")
remaining.append(req)
return {
"clarification_requests": remaining,
"blocked_tasks": blocked_tasks,
"errors": errors,
}
async def run(self, user_input: str) -> dict:
"""Build graph and execute with initial state."""
compiled = self.build_graph()
initial_state = {
"user_input": user_input,
"prd": "",
"tasks": [],
"active_tasks": {},
"completed_tasks": [],
"blocked_tasks": {},
"clarification_requests": [],
"global_architecture": "",
"iteration_count": 0,
"max_iterations": 50,
"errors": [],
}
if self.observability:
self.observability.log_state_transition("init", "run")
result = await compiled.ainvoke(initial_state)
self.save_state(result)
return result
def save_state(self, state: dict, path: str = "app_factory/data/state.json"):
"""Persist state to disk."""
os.makedirs(os.path.dirname(path), exist_ok=True)
# Convert to JSON-serializable form
serializable = {}
for k, v in state.items():
try:
json.dumps(v)
serializable[k] = v
except (TypeError, ValueError):
serializable[k] = str(v)
with open(path, "w") as f:
json.dump(serializable, f, indent=2)
def load_state(self, path: str = "app_factory/data/state.json") -> dict:
"""Load state from disk."""
with open(path) as f:
return json.load(f)

View File

@@ -0,0 +1,83 @@
"""Logging formatters and helpers for colorized terminal output."""
import logging
import os
import sys
from typing import Optional, TextIO
# ANSI SGR escape sequences used to build colorized terminal output.
RESET = "\033[0m"
DIM = "\033[2m"
BOLD = "\033[1m"
FG_BLUE = "\033[34m"
FG_CYAN = "\033[36m"
FG_GREEN = "\033[32m"
FG_MAGENTA = "\033[35m"
FG_YELLOW = "\033[33m"
FG_RED = "\033[31m"
# Style applied to the levelname token for each logging level.
LEVEL_COLORS = {
    logging.DEBUG: f"{DIM}{FG_CYAN}",
    logging.INFO: FG_GREEN,
    logging.WARNING: FG_YELLOW,
    logging.ERROR: FG_RED,
    logging.CRITICAL: f"{BOLD}{FG_RED}",
}
def should_use_color(stream: Optional[TextIO] = None, use_color: Optional[bool] = None) -> bool:
    """Return whether ANSI colors should be used for the given stream.

    Precedence: an explicit *use_color* flag wins; then the NO_COLOR and
    FORCE_COLOR environment variables; then a "dumb" TERM disables color;
    finally the stream's (default: stderr's) tty status decides.
    """
    if use_color is not None:
        return use_color
    if os.getenv("NO_COLOR") is not None:
        return False
    forced = os.getenv("FORCE_COLOR", "").strip().lower()
    if forced not in {"", "0", "false", "no"}:
        return True
    if os.getenv("TERM", "").lower() == "dumb":
        return False
    checker = getattr(stream or sys.stderr, "isatty", None)
    return bool(checker and checker())
def colorize(text: str, style: str, enabled: bool) -> str:
    """Wrap *text* in *style* and RESET when coloring is enabled; else pass it through."""
    return f"{style}{text}{RESET}" if enabled and style else text
class LevelColorFormatter(logging.Formatter):
    """Formatter that colors only the log level token."""

    def __init__(
        self,
        fmt: Optional[str] = None,
        datefmt: Optional[str] = None,
        style: str = "%",
        *,
        stream: Optional[TextIO] = None,
        use_color: Optional[bool] = None,
    ):
        super().__init__(fmt=fmt, datefmt=datefmt, style=style)
        # Decide once, at construction, whether ANSI output is appropriate.
        self._use_color = should_use_color(stream=stream, use_color=use_color)

    def format(self, record: logging.LogRecord) -> str:
        """Format *record*, temporarily coloring its levelname when enabled."""
        if not self._use_color:
            return super().format(record)
        plain_level = record.levelname
        record.levelname = colorize(
            plain_level,
            LEVEL_COLORS.get(record.levelno, ""),
            enabled=True,
        )
        try:
            return super().format(record)
        finally:
            # Restore the record so other handlers see the uncolored level.
            record.levelname = plain_level

View File

@@ -0,0 +1,572 @@
"""Observability Manager - LangSmith tracing, logging, and monitoring."""
import contextlib
import functools
import inspect
import json
import logging
import os
import time
import traceback
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Callable, Optional
from app_factory.core.logging_utils import (
FG_BLUE,
FG_CYAN,
FG_MAGENTA,
LEVEL_COLORS,
colorize,
should_use_color,
)
class _StructuredFormatter(logging.Formatter):
    """Custom formatter: [ISO_TIMESTAMP] [AGENT] [TASK] [LEVEL] message"""

    # Message prefixes that receive an event-specific color when coloring is on.
    _EVENT_COLORS = {
        "State transition": FG_MAGENTA,
        "Token usage": FG_BLUE,
        "Claude event": FG_BLUE,
        "Trace started": FG_CYAN,
        "Trace ended": FG_CYAN,
    }

    def __init__(self, use_color: Optional[bool] = None):
        super().__init__()
        self._use_color = should_use_color(use_color=use_color)

    def _colorize_message(self, message: str) -> str:
        """Color the whole message when it starts with a known event prefix."""
        for prefix, style in self._EVENT_COLORS.items():
            if message.startswith(prefix):
                return colorize(message, style, self._use_color)
        return message

    def format(self, record: logging.LogRecord) -> str:
        """Render the structured line, pulling agent/task off the record's extras."""
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
        agent = getattr(record, "agent_name", "SYSTEM")
        task = getattr(record, "task_id", "-")
        level = record.levelname
        text = record.getMessage()
        if self._use_color:
            stamp = colorize(stamp, FG_BLUE, enabled=True)
            agent = colorize(agent, FG_CYAN, enabled=True)
            task = colorize(task, FG_MAGENTA, enabled=True)
            level = colorize(level, LEVEL_COLORS.get(record.levelno, ""), enabled=True)
            text = self._colorize_message(text)
        return f"[{stamp}] [{agent}] [{task}] [{level}] {text}"
class _TraceContext:
"""Async context manager for trace_context()."""
def __init__(self, manager: "ObservabilityManager", agent_name: str, task_id: str):
self._manager = manager
self._agent_name = agent_name
self._task_id = task_id
self._run_id: Optional[str] = None
async def __aenter__(self) -> str:
self._run_id = self._manager.start_trace(self._agent_name, self._task_id)
return self._run_id
async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool:
if exc_val is not None:
self._manager.end_trace(
self._run_id, error=f"{exc_type.__name__}: {exc_val}"
)
else:
self._manager.end_trace(self._run_id)
return False # do not suppress exceptions
class ObservabilityManager:
    """Wraps LangSmith client for tracing and structured logging.

    Responsibilities:
    - Start/end LangSmith trace runs, degrading gracefully when LangSmith is
      unconfigured or unreachable (tracing calls become log-only).
    - Emit structured, optionally colorized log lines via a dedicated logger.
    - Accumulate in-process metrics (tokens, traces, errors, Claude events,
      tool calls), both globally and per agent.
    """
    # Which Claude event types are emitted per verbosity mode.
    # ``None`` means "log everything"; an empty set means "log nothing".
    _CLAUDE_EVENT_FILTERS = {
        "quiet": {
            "request_start",
            "request_error",
            "request_complete",
            "tool_use",
            "tool_result",
        },
        "focused": {
            "request_start",
            "request_error",
            "request_complete",
            "tool_use",
            "tool_result",
            "thinking_block",
            "result_message",
        },
        "verbose": None,  # no filtering
        "off": set(),
    }

    def __init__(self, project_name: str = None, claude_event_mode: str | None = None):
        """Initialize the manager.

        Args:
            project_name: LangSmith project name; falls back to the
                LANGSMITH_PROJECT env var, then "app-factory".
            claude_event_mode: One of the ``_CLAUDE_EVENT_FILTERS`` keys;
                falls back to the APP_FACTORY_CLAUDE_EVENT_MODE env var.
        """
        self.project_name = project_name or os.getenv("LANGSMITH_PROJECT", "app-factory")
        requested_mode = (
            claude_event_mode
            or os.getenv("APP_FACTORY_CLAUDE_EVENT_MODE", "quiet")
        )
        # NOTE(review): the env default is "quiet", but non-string or
        # unrecognized modes fall back to "focused" -- confirm this
        # asymmetry is intentional.
        normalized_mode = requested_mode.strip().lower() if isinstance(requested_mode, str) else "focused"
        self._claude_event_mode = (
            normalized_mode if normalized_mode in self._CLAUDE_EVENT_FILTERS else "focused"
        )
        # --- LangSmith client (optional) ---
        self._client = None
        try:
            from langsmith import Client  # noqa: F811
            self._client = Client()
        except Exception as exc:
            # LangSmith not configured or unreachable -- degrade gracefully
            logging.getLogger(__name__).warning(
                "LangSmith unavailable (%s). Tracing disabled.", exc
            )
        # --- Structured logger ---
        self.logger = logging.getLogger(f"app_factory.{self.project_name}")
        if not self.logger.handlers:
            # Configure only once per logger name to avoid duplicate handlers.
            handler = logging.StreamHandler()
            handler.setFormatter(_StructuredFormatter())
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
            self.logger.propagate = False
        # Correlation maps: tool_use_id -> tool name / input summary, so a
        # later tool_result event can name the tool that produced it.
        self._tool_name_by_use_id: dict[str, str] = {}
        self._tool_summary_by_use_id: dict[str, str] = {}
        # --- Internal metrics ---
        self._active_runs: dict[str, dict] = {}
        self._metrics = {
            "total_tokens": 0,
            "total_traces": 0,
            "total_errors": 0,
            "total_claude_events": 0,
            "total_tool_calls": 0,
            "per_agent": defaultdict(lambda: {
                "tokens": 0,
                "traces": 0,
                "errors": 0,
                "claude_events": 0,
                "tool_calls": 0,
            }),
        }

    # ------------------------------------------------------------------
    # Tracing
    # ------------------------------------------------------------------
    def start_trace(self, agent_name: str, task_id: str, inputs: dict = None) -> str:
        """Start a new trace run, return run_id."""
        run_id = uuid.uuid4().hex
        self._metrics["total_traces"] += 1
        self._metrics["per_agent"][agent_name]["traces"] += 1
        self._active_runs[run_id] = {
            "agent_name": agent_name,
            "task_id": task_id,
            "start_time": time.time(),
        }
        self.logger.info(
            "Trace started: run_id=%s",
            run_id,
            extra={"agent_name": agent_name, "task_id": task_id},
        )
        try:
            if self._client is not None:
                self._client.create_run(
                    name=f"{agent_name}:{task_id}",
                    run_type="chain",
                    inputs=inputs or {},
                    id=run_id,
                    project_name=self.project_name,
                )
        except Exception as exc:
            # LangSmith failures never break the caller; log and continue.
            self.logger.warning(
                "LangSmith create_run failed: %s",
                exc,
                extra={"agent_name": agent_name, "task_id": task_id},
            )
        return run_id

    def end_trace(self, run_id: str, outputs: dict = None, error: str = None):
        """End a trace run with outputs or error."""
        # pop() so unknown/duplicate run_ids degrade to "unknown" metadata.
        run_info = self._active_runs.pop(run_id, {})
        agent_name = run_info.get("agent_name", "unknown")
        task_id = run_info.get("task_id", "-")
        if error:
            self._metrics["total_errors"] += 1
            self._metrics["per_agent"][agent_name]["errors"] += 1
            self.logger.error(
                "Trace error: run_id=%s error=%s",
                run_id,
                error,
                extra={"agent_name": agent_name, "task_id": task_id},
            )
        else:
            self.logger.info(
                "Trace ended: run_id=%s",
                run_id,
                extra={"agent_name": agent_name, "task_id": task_id},
            )
        try:
            if self._client is not None:
                update_kwargs: dict[str, Any] = {"end_time": datetime.now(timezone.utc)}
                if outputs:
                    update_kwargs["outputs"] = outputs
                if error:
                    update_kwargs["error"] = error
                self._client.update_run(run_id, **update_kwargs)
        except Exception as exc:
            self.logger.warning(
                "LangSmith update_run failed: %s",
                exc,
                extra={"agent_name": agent_name, "task_id": task_id},
            )

    # ------------------------------------------------------------------
    # Decorator
    # ------------------------------------------------------------------
    def trace_agent_execution(self, agent_name: str, task_id: str):
        """Decorator for tracking agent calls with context.

        Works on both sync and async callables; the appropriate wrapper is
        chosen at decoration time.
        """
        def decorator(func: Callable):
            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                run_id = self.start_trace(agent_name, task_id, inputs={"args": str(args), "kwargs": str(kwargs)})
                try:
                    result = await func(*args, **kwargs)
                    self.end_trace(run_id, outputs={"result": str(result)})
                    return result
                except Exception as exc:
                    self.end_trace(run_id, error=f"{type(exc).__name__}: {exc}")
                    raise
            @functools.wraps(func)
            def sync_wrapper(*args, **kwargs):
                run_id = self.start_trace(agent_name, task_id, inputs={"args": str(args), "kwargs": str(kwargs)})
                try:
                    result = func(*args, **kwargs)
                    self.end_trace(run_id, outputs={"result": str(result)})
                    return result
                except Exception as exc:
                    self.end_trace(run_id, error=f"{type(exc).__name__}: {exc}")
                    raise
            if inspect.iscoroutinefunction(func):
                return async_wrapper
            return sync_wrapper
        return decorator

    # ------------------------------------------------------------------
    # Async helpers
    # ------------------------------------------------------------------
    async def trace_agent(self, agent_name: str, task_id: str, func: Callable):
        """Async helper to run a function within a trace context."""
        run_id = self.start_trace(agent_name, task_id)
        try:
            result = await func()
            self.end_trace(run_id, outputs={"result": str(result)})
            return result
        except Exception as exc:
            self.end_trace(run_id, error=f"{type(exc).__name__}: {exc}")
            raise

    def trace_context(self, agent_name: str, task_id: str) -> _TraceContext:
        """Return an async context manager for tracing.

        Usage::

            async with obs.trace_context("agent", "task_id") as run_id:
                ...
        """
        return _TraceContext(self, agent_name, task_id)

    # ------------------------------------------------------------------
    # Logging helpers
    # ------------------------------------------------------------------
    def log_state_transition(self, from_state: str, to_state: str, metadata: dict = None):
        """Log a state machine transition."""
        msg = f"State transition: {from_state} -> {to_state}"
        if metadata:
            msg += f" metadata={metadata}"
        self.logger.info(msg, extra={"agent_name": "STATE_MACHINE", "task_id": "-"})

    def log_token_usage(
        self,
        agent_name: str,
        task_id: str,
        input_tokens: int,
        output_tokens: int,
        model: str = None,
    ):
        """Log token usage for cost monitoring."""
        total = input_tokens + output_tokens
        self._metrics["total_tokens"] += total
        self._metrics["per_agent"][agent_name]["tokens"] += total
        msg = f"Token usage: input={input_tokens} output={output_tokens} total={total}"
        if model:
            msg += f" model={model}"
        self.logger.info(msg, extra={"agent_name": agent_name, "task_id": task_id})

    def log_error(self, agent_name: str, task_id: str, error: Exception, context: dict = None):
        """Log an error with full stack trace."""
        self._metrics["total_errors"] += 1
        self._metrics["per_agent"][agent_name]["errors"] += 1
        tb = traceback.format_exception(type(error), error, error.__traceback__)
        msg = f"Error: {error}\n{''.join(tb)}"
        if context:
            msg += f" context={context}"
        self.logger.error(msg, extra={"agent_name": agent_name, "task_id": task_id})

    def log_claude_event(
        self,
        agent_name: str,
        task_id: str,
        event_type: str,
        payload: dict | None = None,
    ):
        """Log a Claude SDK/CLI event in structured form.

        Metrics and tool-call correlation are always updated; whether a log
        line is actually emitted depends on the configured event mode.
        """
        self._metrics["total_claude_events"] += 1
        self._metrics["per_agent"][agent_name]["claude_events"] += 1
        normalized_event = (event_type or "unknown").strip().lower()
        normalized_payload = dict(payload or {})
        if normalized_event == "tool_use":
            self._metrics["total_tool_calls"] += 1
            self._metrics["per_agent"][agent_name]["tool_calls"] += 1
            tool_use_id = normalized_payload.get("tool_use_id")
            tool_name = normalized_payload.get("tool_name")
            tool_input = normalized_payload.get("tool_input")
            if isinstance(tool_use_id, str) and isinstance(tool_name, str):
                # Remember the tool name/input so the matching tool_result
                # event can be rendered with context.
                self._tool_name_by_use_id[tool_use_id] = tool_name
                self._tool_summary_by_use_id[tool_use_id] = self._summarize_tool_input(
                    str(tool_name),
                    tool_input,
                )
        if normalized_event == "tool_result":
            tool_use_id = normalized_payload.get("tool_use_id")
            if isinstance(tool_use_id, str):
                # pop() keeps the correlation maps from growing unboundedly.
                tool_name = self._tool_name_by_use_id.pop(tool_use_id, None)
                tool_summary = self._tool_summary_by_use_id.pop(tool_use_id, None)
                if "tool_name" not in normalized_payload and tool_name:
                    normalized_payload["tool_name"] = tool_name
                if "tool_input_summary" not in normalized_payload and tool_summary:
                    normalized_payload["tool_input_summary"] = tool_summary
        if not self._should_log_claude_event(normalized_event):
            return
        msg = self._format_claude_event_message(normalized_event, normalized_payload)
        if not msg:
            return
        self.logger.debug(msg, extra={"agent_name": agent_name, "task_id": task_id})

    def _should_log_claude_event(self, event_type: str) -> bool:
        """Return True when the current event mode allows this event type."""
        allowed = self._CLAUDE_EVENT_FILTERS.get(self._claude_event_mode)
        if allowed is None:
            return True  # verbose mode: log everything
        return event_type in allowed

    def _format_claude_event_message(self, event_type: str, payload: dict[str, Any]) -> str:
        """Render a one-line human-readable message for a Claude event.

        Returns an empty string for events that should be suppressed (e.g.
        successful results from noisy read-only tools).
        """
        session_id = payload.get("session_id")
        session_suffix = f" session={session_id}" if session_id else ""
        if event_type == "request_start":
            model = payload.get("model") or "default"
            prompt_chars = payload.get("prompt_chars", 0)
            return f"Claude request started: model={model} prompt_chars={prompt_chars}{session_suffix}"
        if event_type == "request_complete":
            inp = payload.get("input_tokens", 0)
            out = payload.get("output_tokens", 0)
            subtype = payload.get("result_subtype") or "unknown"
            preview = self._shorten_text(payload.get("result_preview"), max_chars=140)
            preview_fragment = f' result="{preview}"' if preview else ""
            return (
                f"Claude request completed: subtype={subtype} "
                f"tokens={inp}->{out}{preview_fragment}{session_suffix}"
            )
        if event_type == "request_error":
            err = self._shorten_text(payload.get("error"))
            retrying = payload.get("retrying")
            retry_fragment = " retrying=true" if retrying else ""
            return f"Claude request error: {err}{retry_fragment}{session_suffix}"
        if event_type == "tool_use":
            tool_name = payload.get("tool_name", "unknown_tool")
            tool_input = payload.get("tool_input")
            input_summary = self._summarize_tool_input(str(tool_name), tool_input)
            return f"Claude tool call: {tool_name} {input_summary}{session_suffix}"
        if event_type == "tool_result":
            tool_name = payload.get("tool_name", "tool")
            is_error = bool(payload.get("is_error", False))
            content = payload.get("content")
            input_summary = payload.get("tool_input_summary")
            input_fragment = f" {input_summary}" if input_summary else ""
            status = "error" if is_error else "ok"
            # Successful results from noisy read-only tools are suppressed
            # entirely; their errors are logged without full content.
            if self._is_noisy_tool_name(str(tool_name)) and not is_error:
                return ""
            if self._is_noisy_tool_name(str(tool_name)) and is_error:
                error_preview = self._shorten_text(content, max_chars=420)
                error_fragment = f" error={error_preview}" if error_preview else ""
                return (
                    f"Claude tool result: {tool_name} status={status}"
                    f"{input_fragment}{error_fragment}{session_suffix}"
                )
            content_preview = self._compact_json(content, max_chars=420)
            return (
                f"Claude tool result: {tool_name} status={status}"
                f"{input_fragment} content={content_preview}{session_suffix}"
            )
        if event_type == "text_block":
            preview = self._shorten_text(payload.get("preview"))
            return f"Claude says: {preview}{session_suffix}"
        if event_type == "thinking_block":
            chars = payload.get("chars", 0)
            return f"Claude thinking block: chars={chars}{session_suffix}"
        if event_type == "result_message":
            subtype = payload.get("subtype", "unknown")
            turns = payload.get("num_turns", 0)
            duration_ms = payload.get("duration_ms")
            duration_fragment = f" duration_ms={duration_ms}" if duration_ms is not None else ""
            return f"Claude result message: subtype={subtype} turns={turns}{duration_fragment}{session_suffix}"
        # Unknown event type: dump the payload as compact JSON.
        payload_json = self._compact_json(payload)
        return f"Claude event: type={event_type} payload={payload_json}{session_suffix}"

    @staticmethod
    def _shorten_text(value: Any, max_chars: int = 220) -> str:
        """Flatten *value* to a single line, truncated to *max_chars* with an ellipsis."""
        text = str(value) if value is not None else ""
        text = text.strip().replace("\n", " ")
        if len(text) <= max_chars:
            return text
        return f"{text[:max_chars]}..."

    @staticmethod
    def _compact_json(value: Any, max_chars: int = 300) -> str:
        """Render *value* as compact JSON (truncated); fall back to str() on failure."""
        with contextlib.suppress(TypeError, ValueError):
            rendered = json.dumps(value, sort_keys=True, default=str)
            if len(rendered) <= max_chars:
                return rendered
            return f"{rendered[:max_chars]}..."
        return ObservabilityManager._shorten_text(value, max_chars=max_chars)

    @staticmethod
    def _is_noisy_tool_name(tool_name: str) -> bool:
        """Return True for high-volume read-only tools whose successes are not logged."""
        return tool_name.lower() in {"read", "bash", "grep", "glob", "find", "ls"}

    @classmethod
    def _summarize_tool_input(cls, tool_name: str, tool_input: Any) -> str:
        """Produce a short "key=value" summary of a tool's input for log lines."""
        if not isinstance(tool_input, dict):
            return f"input={cls._compact_json(tool_input, max_chars=140)}"
        normalized_name = tool_name.lower()
        if normalized_name == "read":
            path = tool_input.get("file_path") or tool_input.get("path")
            return f"path={cls._shorten_path(path, max_chars=120)}"
        if normalized_name == "bash":
            cmd = tool_input.get("command")
            compact_cmd = cls._abbreviate_workspace_paths(cmd)
            return f"command={cls._shorten_text(compact_cmd, max_chars=160)}"
        description = tool_input.get("description")
        if isinstance(description, str) and description.strip():
            return f"description={cls._shorten_text(description, max_chars=140)}"
        # Generic fallback: pick the most informative well-known keys.
        summary_keys = ("file_path", "path", "pattern", "query", "command", "name")
        summary: dict[str, Any] = {}
        for key in summary_keys:
            if key in tool_input:
                value = tool_input[key]
                if key in {"file_path", "path"}:
                    value = cls._shorten_path(value, max_chars=120)
                summary[key] = value
        if summary:
            return f"input={cls._compact_json(summary, max_chars=160)}"
        return f"input={cls._compact_json(tool_input, max_chars=160)}"

    @classmethod
    def _shorten_path(cls, value: Any, max_chars: int = 120) -> str:
        """Rewrite *value* relative to the current working directory, then truncate."""
        text = str(value).strip() if value is not None else ""
        if not text:
            return ""
        normalized = text
        with contextlib.suppress(Exception):
            cwd = os.path.abspath(os.getcwd())
            if os.path.isabs(text):
                abs_path = os.path.abspath(text)
                if abs_path == cwd:
                    normalized = "."
                elif abs_path.startswith(f"{cwd}{os.sep}"):
                    normalized = os.path.relpath(abs_path, cwd)
                else:
                    normalized = text.replace(f"{cwd}{os.sep}", "")
        return cls._shorten_text(normalized, max_chars=max_chars)

    @staticmethod
    def _abbreviate_workspace_paths(value: Any) -> str:
        """Strip absolute cwd prefixes from *value* to keep log lines short."""
        text = str(value).strip() if value is not None else ""
        if not text:
            return ""
        compact = text
        with contextlib.suppress(Exception):
            cwd = os.path.abspath(os.getcwd())
            compact = compact.replace(f"{cwd}{os.sep}", "")
            compact = compact.replace(cwd, ".")
        return compact

    @classmethod
    def _estimate_chars(cls, value: Any) -> int:
        """Estimate the character length of *value* as it would appear serialized."""
        if value is None:
            return 0
        if isinstance(value, str):
            return len(value)
        with contextlib.suppress(TypeError, ValueError):
            return len(json.dumps(value, default=str))
        return len(str(value))

    # ------------------------------------------------------------------
    # Metrics
    # ------------------------------------------------------------------
    def get_metrics(self) -> dict:
        """Return accumulated metrics (total tokens, traces, errors)."""
        return {
            "total_tokens": self._metrics["total_tokens"],
            "total_traces": self._metrics["total_traces"],
            "total_errors": self._metrics["total_errors"],
            "total_claude_events": self._metrics["total_claude_events"],
            "total_tool_calls": self._metrics["total_tool_calls"],
            "per_agent": dict(self._metrics["per_agent"]),
        }

View File

@@ -0,0 +1,230 @@
"""Workspace Manager - Handles git worktrees and Docker containers for isolated execution."""
import os
import shutil
from pathlib import Path
import docker
import git
class WorkspaceError(Exception):
    """Base exception for workspace operations.

    Catch this to handle any workspace failure uniformly.
    """
class GitWorktreeError(WorkspaceError):
    """Exception for git worktree failures (creation, validation, removal)."""
class DockerProvisionError(WorkspaceError):
    """Exception for Docker provisioning failures (daemon connection, image pull, container create)."""
class WorkspaceManager:
    """Manages git worktrees and Docker containers for isolated Dev Agent execution.

    Each task gets a dedicated git worktree (on its own feature branch) that
    is bind-mounted into a network-isolated Docker container. Active
    workspaces are tracked in ``self.active_workspaces`` keyed by task id.
    """

    def __init__(self, repo_path: str, docker_image: str = "python:3.11-slim"):
        """Initialize WorkspaceManager.

        Args:
            repo_path: Path to the git repository.
            docker_image: Docker image to use for clean room containers.

        Raises:
            GitWorktreeError: If *repo_path* is missing or not a git repo.
            DockerProvisionError: If the Docker daemon is unreachable.
        """
        try:
            self.repo = git.Repo(repo_path)
        except git.InvalidGitRepositoryError as e:
            raise GitWorktreeError(f"Invalid git repository: {repo_path}") from e
        except git.NoSuchPathError as e:
            raise GitWorktreeError(f"Repository path not found: {repo_path}") from e
        self.repo_path = Path(repo_path).resolve()
        self.docker_image = docker_image
        # task_id -> {task_id, worktree_path, container_id, container}
        self.active_workspaces: dict[str, dict] = {}
        try:
            self.docker_client = docker.from_env()
        except docker.errors.DockerException as e:
            raise DockerProvisionError(
                "Failed to connect to Docker daemon. Is Docker running?"
            ) from e

    async def create_worktree(self, task_id: str, base_branch: str = "main") -> str:
        """Create a git worktree for a task.

        The worktree is placed in a sibling ``worktrees/<task_id>`` directory
        next to the repository, on a new ``feature/task-<task_id>`` branch.

        Args:
            task_id: Unique identifier for the task.
            base_branch: Branch to base the worktree on.

        Returns:
            Absolute path to the created worktree.

        Raises:
            GitWorktreeError: If worktree creation fails.
        """
        branch_name = f"feature/task-{task_id}"
        worktree_path = str(self.repo_path.parent / "worktrees" / task_id)
        # Validate base branch exists
        try:
            self.repo.git.rev_parse("--verify", base_branch)
        except git.GitCommandError as e:
            raise GitWorktreeError(
                f"Base branch '{base_branch}' does not exist"
            ) from e
        # Check if worktree path already exists
        if os.path.exists(worktree_path):
            raise GitWorktreeError(
                f"Worktree path already exists: {worktree_path}"
            )
        # Check if branch already exists
        if branch_name in [ref.name for ref in self.repo.branches]:
            raise GitWorktreeError(
                f"Branch already exists: {branch_name}"
            )
        try:
            os.makedirs(os.path.dirname(worktree_path), exist_ok=True)
            # "worktree add <path> -b <branch> <base>" creates branch + checkout.
            self.repo.git.worktree(
                "add", worktree_path, "-b", branch_name, base_branch
            )
        except git.GitCommandError as e:
            raise GitWorktreeError(
                f"Failed to create worktree for task {task_id}: {e}"
            ) from e
        return str(Path(worktree_path).resolve())

    async def spin_up_clean_room(self, worktree_path: str, task_id: str):
        """Create an isolated Docker container for a task.

        The worktree is bind-mounted read-write at /workspace and the
        container is created with no network access; it idles on
        ``sleep infinity`` until commands are exec'd into it.

        Args:
            worktree_path: Path to the git worktree to mount.
            task_id: Unique identifier for the task.

        Returns:
            Container object with metadata.

        Raises:
            DockerProvisionError: If container creation fails.
        """
        try:
            # NOTE(review): pulls on every call even if the image is cached
            # locally -- confirm this is intended (it requires registry access).
            self.docker_client.images.pull(self.docker_image)
        except docker.errors.APIError as e:
            raise DockerProvisionError(
                f"Failed to pull image '{self.docker_image}': {e}"
            ) from e
        try:
            container = self.docker_client.containers.create(
                image=self.docker_image,
                name=f"appfactory-task-{task_id}",
                volumes={
                    worktree_path: {"bind": "/workspace", "mode": "rw"}
                },
                working_dir="/workspace",
                network_mode="none",  # full network isolation for the clean room
                auto_remove=False,
                detach=True,
                command="sleep infinity",  # keep the container alive for exec
            )
        except docker.errors.APIError as e:
            raise DockerProvisionError(
                f"Failed to create container for task {task_id}: {e}"
            ) from e
        self.active_workspaces[task_id] = {
            "task_id": task_id,
            "worktree_path": worktree_path,
            "container_id": container.id,
            "container": container,
        }
        return container

    async def cleanup_workspace(self, task_id: str, container=None):
        """Clean up a workspace by removing its container and worktree.

        Best-effort: each cleanup step is attempted even if earlier steps
        fail; accumulated failures are reported in a single WorkspaceError
        at the end.

        Args:
            task_id: Unique identifier for the task.
            container: Optional container object. If None, uses the registered one.

        Raises:
            WorkspaceError: If cleanup fails completely.
        """
        workspace = self.active_workspaces.get(task_id, {})
        errors = []
        # Resolve container
        if container is None:
            container = workspace.get("container")
        # Stop and remove container
        if container is not None:
            try:
                container.stop(timeout=5)
            except Exception:
                pass  # Container may already be stopped
            try:
                container.remove(force=True)
            except Exception as e:
                errors.append(f"Container removal failed: {e}")
        # Remove worktree
        worktree_path = workspace.get("worktree_path")
        if worktree_path is None:
            # Fall back to the conventional location used by create_worktree.
            worktree_path = str(self.repo_path.parent / "worktrees" / task_id)
        try:
            self.repo.git.worktree("remove", worktree_path, "--force")
        except git.GitCommandError:
            # Worktree may already be removed; try cleaning up the directory
            if os.path.exists(worktree_path):
                try:
                    shutil.rmtree(worktree_path)
                except OSError as e:
                    errors.append(f"Worktree directory removal failed: {e}")
        # Prune worktree references
        try:
            self.repo.git.worktree("prune")
        except git.GitCommandError:
            pass
        # Remove from registry
        self.active_workspaces.pop(task_id, None)
        if errors:
            raise WorkspaceError(
                f"Cleanup completed with errors for task {task_id}: {'; '.join(errors)}"
            )

    def get_active_workspaces(self) -> list:
        """Return list of active workspace info dicts.

        Returns:
            List of dicts with task_id, worktree_path, and container_id.
        """
        return [
            {
                "task_id": info["task_id"],
                "worktree_path": info["worktree_path"],
                "container_id": info["container_id"],
            }
            for info in self.active_workspaces.values()
        ]

    async def cleanup_all(self):
        """Cleanup all active workspaces. Used for graceful shutdown.

        Raises:
            WorkspaceError: Aggregated if any individual cleanup failed.
        """
        task_ids = list(self.active_workspaces.keys())
        errors = []
        for task_id in task_ids:
            try:
                await self.cleanup_workspace(task_id)
            except WorkspaceError as e:
                errors.append(str(e))
        if errors:
            raise WorkspaceError(
                f"Cleanup all completed with errors: {'; '.join(errors)}"
            )

View File

@@ -0,0 +1 @@
"""Data models and schemas for App Factory."""

View File

@@ -0,0 +1,15 @@
{
"user_input": "please review the project in the app_factory directory and create an api middleware to sit between the app_factory and the ui. if the core app does not have the required features, they will be added according to the middleware spec. the middleware should support sending + receiving data (start/stop jobs, respond to pm questions, etc), tracking progress, errors, logs, etc. visualizing the graph, tracking multiple projects running at a time",
"prd": "",
"tasks": [],
"active_tasks": {},
"completed_tasks": [],
"blocked_tasks": {},
"clarification_requests": [],
"global_architecture": "",
"iteration_count": 0,
"max_iterations": 50,
"errors": [
"PM agent error: Claude SDK query failed: Command failed with exit code -9 (exit code: -9)\nError output: Check stderr output for details\nHint: verify Claude auth is available (ANTHROPIC_API_KEY or ANTHROPIC_AUTH_TOKEN, or a valid Claude Code OAuth session) and that the process can write ~/.claude and ~/.claude.json."
]
}

View File

@@ -0,0 +1 @@
"""Prompt templates and management for App Factory agents."""

View File

@@ -0,0 +1,22 @@
You are a Dev Agent working on a specific task in an automated software factory.
## YOUR TASK
- Task ID: {task_id}
- Title: {title}
- Description: {description}
## DETAILED REQUIREMENTS
{details}
## TEST STRATEGY
{test_strategy}
## GLOBAL ARCHITECTURE (Read-Only Context)
{global_architecture}
## STRICT INSTRUCTIONS
1. Implement ONLY this task. Do not make changes unrelated to this task.
2. Follow existing code patterns and conventions from the architecture summary.
3. Create or update test files as specified in the test strategy.
4. All tests must pass before you consider the task complete.
5. Do not modify files outside the scope of this task.

View File

@@ -0,0 +1,11 @@
You are a Product Manager resolving a clarification request from a downstream agent.
Agent: {requesting_agent}
Task ID: {task_id}
Question: {question}
Context: {context}
If you can answer this question based on the PRD and general best practices, provide a clear, specific answer.
If the question requires human input (business decision, external dependency, or ambiguous requirement), respond with exactly: ESCALATE_TO_HUMAN
Provide only the answer, no preamble.

View File

@@ -0,0 +1,12 @@
You are an expert Product Manager. Analyze the user's project description and expand it into a comprehensive Product Requirements Document (PRD).
Your PRD must include these sections:
1. **Objective** - Clear project goal and vision
2. **Core Requirements** - Detailed functional requirements (numbered list)
3. **Technical Architecture** - System design, components, data flow
4. **Tech Stack** - Languages, frameworks, databases, infrastructure
5. **Success Criteria** - Measurable outcomes for project completion
6. **Non-Functional Requirements** - Performance, security, scalability constraints
Be specific and actionable. Include edge cases and error handling requirements.
Fill in reasonable technical decisions where the user hasn't specified.

View File

@@ -0,0 +1,20 @@
You are a QA code reviewer in an automated software factory. Review the following code changes for quality and security.
## Task Context
{task_context}
## Code Diff
{diff}
## Review Checklist
1. **Security**: Check for OWASP Top 10 vulnerabilities (SQL injection, XSS, command injection, path traversal)
2. **Code Quality**: Proper error handling, no dead code, clear naming, appropriate abstractions
3. **Task Adherence**: Changes match the task requirements, no scope creep
4. **Testing**: Adequate test coverage for the changes
5. **Potential Bugs**: Race conditions, edge cases, null/None handling
Respond in this format:
APPROVED: true/false
ISSUES:
- [severity: critical/warning/info] description
SUMMARY: One sentence summary of review