commit 047cf4b606ef3c175c4b9919bd5c6e1e930f5b57 Author: Kaloyan Nikolov Date: Fri Apr 17 15:30:19 2026 +0200 Initial commit: research-pi headless research orchestrator diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd6e803 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +*.log +.DS_Store diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..109cf7c --- /dev/null +++ b/PLAN.md @@ -0,0 +1,173 @@ +# POC Plan: Testing Framework Platform + +## Overview + +Based on comprehensive web and academic research on "test" methodologies, this document outlines a Proof of Concept (POC) plan for building a next-generation testing platform that integrates modern testing paradigms, AI-augmented testing, and developer experience improvements. + +## Recommended Stack + +### Core Technology Stack + +| Layer | Technology | Rationale | +|-------|------------|-----------| +| **Runtime** | Node.js 20+ with TypeScript | Dominant ecosystem, excellent testing tool support | +| **Test Runner** | Vitest | Native ESM support, Vite integration, faster than Jest | +| **E2E Testing** | Playwright | Industry leader, cross-browser support, reliable auto-waiting | +| **UI Components** | Storybook + Testing Library | Component isolation, behavior-focused testing | +| **API Testing** | MSW (Mock Service Worker) + Supertest | Realistic mocking, boundary testing | +| **Coverage** | V8/Built-in (Vitest) | Fast, accurate, native integration | +| **CI/CD** | GitHub Actions | Native integration, extensive testing actions | + +### Optional Advanced Components + +| Component | Technology | Use Case | +|-----------|------------|----------| +| **Contract Testing** | Pact | Microservices with API contracts | +| **Load Testing** | k6 | Performance validation | +| **Mutation Testing** | Stryker JS | Test quality assurance | +| **Visual Regression** | Chromatic/Storybook | UI consistency testing | + +## Architecture Overview + +``` 
+┌─────────────────────────────────────────────────────────────────┐ +│ TESTING PLATFORM │ +├─────────────────────────────────────────────────────────────────┤ +│ Test Orchestrator │ +│ ├── Test Discovery & Scheduling │ +│ ├── Parallel Execution Engine │ +│ ├── Result Aggregation & Reporting │ +│ └── CI/CD Integration Layer │ +├─────────────────────────────────────────────────────────────────┤ +│ Test Type Modules │ +│ ├── Unit Tests (Vitest) │ +│ ├── Integration Tests (Supertest/MSW) │ +│ ├── E2E Tests (Playwright) │ +│ ├── Component Tests (Storybook) │ +│ └── Contract Tests (Pact - optional) │ +├─────────────────────────────────────────────────────────────────┤ +│ AI-Augmented Layer (Future) │ +│ ├── Test Generation (LLM-based) │ +│ ├── Test Failure Diagnosis │ +│ └── Coverage Gap Analysis │ +├─────────────────────────────────────────────────────────────────┤ +│ Developer Experience │ +│ ├── Watch Mode with Hot Reload │ +│ ├── Interactive HTML Reports │ +│ ├── VS Code Extension │ +│ └── Slack/Discord Notifications │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## POC Scope & Timeline + +### Week 1: Foundation (Days 1-5) + +**Day 1-2: Project Setup** +- Initialize TypeScript + Vitest project +- Configure linting (ESLint) and formatting (Prettier) +- Set up directory structure +- Basic test runner configuration + +**Day 3-5: Core Testing Infrastructure** +- Implement test discovery mechanism +- Create test execution engine +- Build result formatting/reporting +- Add coverage collection + +**Deliverable:** CLI tool that can discover and run tests with basic reporting + +### Week 2: E2E & Component Testing (Days 6-10) + +**Day 6-7: Playwright Integration** +- Install and configure Playwright +- Create reusable page object patterns +- Implement E2E test scaffolding + +**Day 8-10: Storybook Integration** +- Set up Storybook for UI components +- Configure component testing +- Create example component with tests + +**Deliverable:** Working 
E2E and component test examples + +### Week 3: Developer Experience (Days 11-15) + +**Day 11-12: Watch Mode & IDE Support** +- Implement file watching for test re-execution +- Create VS Code task configurations +- Add debug configurations + +**Day 13-14: Reporting & CI/CD** +- Build HTML test reports +- Create GitHub Actions workflow +- Add coverage badges + +**Day 15: Documentation** +- Write comprehensive README +- Create usage examples +- Document architecture decisions + +**Deliverable:** Production-ready testing platform with documentation + +## Key Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| **Tool Compatibility Issues** | Medium | High | Test integrations early, maintain fallback options | +| **Performance at Scale** | Medium | High | Design for parallelization from start, benchmark regularly | +| **LLM Integration Complexity** | High | Medium | Defer AI features to post-POC phase | +| **Developer Adoption** | Medium | High | Focus on DX, provide migration guides from Jest | +| **CI/CD Integration Complexity** | Low | Medium | Use well-documented GitHub Actions patterns | + +## Success Criteria + +### Technical Metrics +- [ ] Test execution speed >= 2x faster than Jest equivalent +- [ ] 100% coverage reporting accuracy +- [ ] <100ms watch mode feedback loop +- [ ] Successful parallel execution without flakiness + +### Developer Experience Metrics +- [ ] Zero-config startup for common project types +- [ ] Clear, actionable error messages +- [ ] Interactive HTML report with filtering +- [ ] VS Code integration for running/debugging tests + +## Future Enhancements (Post-POC) + +### Phase 2: AI-Augmented Testing +- LLM-based test generation from code analysis +- Automated test failure root cause analysis (inspired by Google's 90% accuracy approach) +- Coverage gap identification with suggested test cases + +### Phase 3: Advanced Testing Modes +- Property-based testing integration (fast-check) +- Mutation 
testing integration (Stryker) +- Chaos engineering hooks + +### Phase 4: Enterprise Features +- Multi-project monorepo support +- Distributed test execution +- Advanced reporting dashboards +- Test flakiness detection and quarantine + +## Research References + +This plan is informed by: +- [Web Research Summary](./research/web-summary.md) - Current tooling landscape and trends +- [Paper Research Summary](./research/paper-summary.md) - Academic research on testing methodologies + +### Key Insights Applied + +1. **Playwright over Cypress** - Research shows Playwright has better cross-browser support and reliability +2. **Vitest for Vite projects** - Emerging as the modern alternative to Jest with native ESM +3. **Testing Library philosophy** - Test behavior, not implementation +4. **LLM-augmented testing** - Research shows significant potential (90%+ accuracy in failure diagnosis) +5. **Mutation testing value** - Studies confirm it improves actual test quality beyond coverage metrics + +--- + +*Document Version: 1.0* +*Created: April 2026* +*Status: Draft for Review* diff --git a/README.md b/README.md new file mode 100644 index 0000000..2a45048 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# research-pi + +Headless research orchestrator for [`pi-coding-agent`](https://github.com/mariozechner/pi-coding-agent). + +Spawns a headless Pi orchestrator that delegates to read-only subagents to: +- Research new topics (`--start_research`) +- Onboard existing codebases (`--onboarding`) +- Plan new features (`--new_feature`) + +## Install + +```bash +curl -fsSL https://raw.githubusercontent.com/YOUR_USERNAME/YOUR_REPO/main/install.sh | bash +``` + +Requires: `node`, `pnpm`, and `pi` (pi-coding-agent) installed. 
+ +## Usage + +```bash +# Research a new topic +research --model kimi-for-coding --start_research \ + --task "native android app using gemma 4 e4b" + +# Onboard an existing project +research --model minimax-token-plan/MiniMax-M2.7 --onboarding + +# Onboard a specific part of a project +research --model k2p5 --onboarding \ + --task "i6_experiments/user/nikolov/experiments/voxpopuli" + +# Plan a new feature +research --model kimi-for-coding --new_feature \ + --task "add a comment section to my react blog" +``` + +## Outputs + +| Mode | Files written | +|------|---------------| +| `--start_research` | `PLAN.md`, `research/web-summary.md`, `research/paper-summary.md` | +| `--onboarding` | `MAP.md`, `ONBOARDING.md` | +| `--new_feature` | `FEATURE.md` | + +## Configuration + +Create `~/.pi/research/config.json`: + +```json +{ + "webSearch": { + "mode": "extension", + "searxngUrl": "http://192.168.178.58:7777", + "mcpUrl": "http://sleepy-think:3001/mcp" + }, + "models": { + "default": "kimi-for-coding", + "web-researcher": "k2p5", + "paper-researcher": "minimax-token-plan/MiniMax-M2.7" + } +} +``` + +- `webSearch.mode`: `extension` (default, ships embedded SearXNG extension), `mcp` (proxy to MCP server), or `skill` (raw curl fallback) +- `models.default`: fallback if no `--model` passed +- `models.`: per-subagent model override + +## Architecture + +- `bin/research` — thin launcher that resolves models, builds the Pi CLI invocation, and streams output +- `extensions/subagent-spawner.ts` — registers `spawn_subagent` tool so the orchestrator can delegate +- `extensions/web-search.ts` — SearXNG-based `web_search` + `web_fetch` tools +- `extensions/mcp-web-search.ts` — MCP-proxy variant +- `agents/orchestrator.md` — system prompt for the orchestrator +- The orchestrator is the **only** agent with `write` access. All subagents are strictly read-only. 
diff --git a/agents/orchestrator.md b/agents/orchestrator.md new file mode 100644 index 0000000..10f9c01 --- /dev/null +++ b/agents/orchestrator.md @@ -0,0 +1,71 @@ +You are a Research Orchestrator. You coordinate headless pi subagents to gather information and write structured markdown deliverables. You are the ONLY agent allowed to write files. + +## Your Tools +- `read` — inspect files in the local project +- `write` — create or overwrite markdown deliverables +- `bash` — run quick commands (e.g. count files, check versions) +- `grep`, `find`, `ls` — inspect the codebase structure +- `spawn_subagent` — delegate research/mapping tasks to specialist subagents + +## Modes +The user invoked you in exactly one of these modes. Your goal is to produce the listed files. + +### start_research +Goal: research a topic from scratch (no codebase, or a fresh project folder). +Outputs to write: +- `PLAN.md`: high-level POC plan, recommended stack, risks, timeline +- `research/web-summary.md`: 1-2 page summary of web findings with links +- `research/paper-summary.md`: 1-2 page summary of papers/reports (if relevant) + +### onboarding +Goal: understand an existing codebase. +Outputs to write: +- `MAP.md`: concise feature-to-location mapping, architecture overview +- `ONBOARDING.md`: project description + per-feature guide (where it lives, how to use it, inputs/outputs) + +### new_feature +Goal: figure out how to implement a specific feature in the existing project. +Outputs to write: +- `FEATURE.md`: findings on how this is done currently (SOTA, libraries, patterns), plus tailored integration advice for this specific codebase + +## How to Work +1. **Assess the situation.** Use `ls`, `find`, `bash` (e.g. `find . -type f | wc -l`, `tokei` if available) to gauge codebase size. +2. **Decide your attack plan.** You do NOT need to ask permission. Spawn subagents as you see fit, in parallel or sequence. +3. 
**Delegate via `spawn_subagent`.** Give each subagent a clear, self-contained task. +4. **Synthesize and write.** Collect outputs, then write the final markdown files yourself. + +## Subagent Conventions +When you spawn a subagent, choose the appropriate toolset. They are read-only unless you explicitly give them write/edit tools (which you should NOT do). + +- **Web/Paper researchers** — `tools: "read,bash"`, load the `web-search` extension (or `mcp-web-search` if the config says MCP). They may use bash only for curl/search/fetch. Forbidden: git modifications, redirects to files, rm, etc. +- **Codebase mappers** — `tools: "read,grep,find,ls"`. No bash, no write, no edit. They crawl the source and return structured findings. +- **Project analyzers** — `tools: "read,grep,find,ls"`. No bash, no write, no edit. They analyze package files and integration points. + +Recommended subagent names and roles: +- `web-researcher`: searches frameworks, docs, blogs, repos, latest implementations +- `paper-researcher`: searches arxiv, technical reports, research implementations +- `codebase-discovery`: top-level scan, lists major modules/features +- `module-mapper`: deep-dive into one directory/module +- `dependency-analyzer`: extracts deps, versions, build configs +- `project-analyzer`: understands current stack and where a new feature fits +- `feature-researcher`: researches best practices for a specific feature + +You may spawn multiple `module-mapper` agents in parallel for large codebases. +You may spawn `paper-researcher` whenever a topic feels scientific, algorithmic, performance-oriented, or ML-adjacent. Use your judgment. + +## Read-Only Enforcement +Include this exact paragraph in every subagent task: +> "You are read-only. You may NOT write files, edit code, run git commands that modify state, or use shell redirects (`>`, `>>`). Return all findings as text in your response." 
+ +## Fast-Forward Hints +- In `onboarding` mode, if `MAP.md` or `ONBOARDING.md` already exist, you may still regenerate them if the user wants a fresh pass, but you can also read them to save time. +- In `new_feature` mode, if `MAP.md` or `ONBOARDING.md` exist, read them first to understand the project before spawning analyzers. + +## Output Quality +- Be concise but complete. +- Include file paths and code references where relevant. +- For links, use markdown `[title](url)` format. +- Use `bash` to `mkdir -p` parent directories before `write` if needed. +- If a subagent times out or fails, note the gap explicitly in your deliverables. + +Now begin. diff --git a/bin/research b/bin/research new file mode 100755 index 0000000..41f0146 --- /dev/null +++ b/bin/research @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Resolve symlinks to find the actual script location +SCRIPT_PATH="${BASH_SOURCE[0]}" +while [ -L "$SCRIPT_PATH" ]; do + SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)" + SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" + case "$SCRIPT_PATH" in + /*) ;; + *) SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH" ;; + esac +done +SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +exec node "$ROOT_DIR/dist/main.js" "$@" diff --git a/config/default.json b/config/default.json new file mode 100644 index 0000000..99abf46 --- /dev/null +++ b/config/default.json @@ -0,0 +1,10 @@ +{ + "webSearch": { + "mode": "extension", + "searxngUrl": "http://192.168.178.58:7777", + "mcpUrl": "http://sleepy-think:3001/mcp" + }, + "models": { + "default": "kimi-for-coding" + } +} diff --git a/extensions/mcp-web-search.ts b/extensions/mcp-web-search.ts new file mode 100644 index 0000000..ada6c32 --- /dev/null +++ b/extensions/mcp-web-search.ts @@ -0,0 +1,102 @@ +/** + * MCP Web Search Proxy + * Proxies web_search / web_fetch to an MCP server endpoint. 
+ */ + +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; + +function getMcpUrl(): string { + return process.env.MCP_URL || "http://sleepy-think:3001/mcp"; +} + +const WEB_TOOLS_SECTION = `\`web_search\` — Web lookup via MCP proxy. Returns up to 20 results per query. Follow-up with web_fetch for content from promising URLs. +\`web_fetch\` — extract page text via MCP proxy. Scale maxLength to content type.`; + +async function mcpCall(toolName: string, args: Record): Promise { + const url = getMcpUrl(); + const body = { + jsonrpc: "2.0", + id: Date.now(), + method: "tools/call", + params: { name: toolName, arguments: args }, + }; + + const res = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + signal: AbortSignal.timeout(20000), + body: JSON.stringify(body), + }); + + if (!res.ok) { + throw new Error(`MCP proxy returned ${res.status} ${res.statusText}`); + } + + const data = (await res.json()) as any; + if (data.error) { + throw new Error(`MCP error: ${data.error.message || JSON.stringify(data.error)}`); + } + + // MCP returns content as an array of { type: "text", text: "..." } + const content = data.result?.content; + if (!content || !Array.isArray(content)) { + throw new Error("Unexpected MCP response format"); + } + + return content; +} + +export default function mcpWebSearchExtension(pi: ExtensionAPI) { + pi.on("before_agent_start", async (event) => { + if (!event.systemPrompt.includes("web_search")) { + return { systemPrompt: event.systemPrompt + "\n" + WEB_TOOLS_SECTION }; + } + return { systemPrompt: event.systemPrompt }; + }); + + pi.registerTool({ + name: "web_search", + label: "Web Search (MCP)", + description: "Search the web via MCP proxy. 
Returns up to 20 results.", + parameters: Type.Object({ + query: Type.String({ description: "The search query to execute (max 2000 characters)" }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { query } = params as { query: string }; + try { + const content = await mcpCall("web_search", { query }); + return { content }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `MCP web_search error: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch (MCP)", + description: "Fetch a URL as text via MCP proxy.", + parameters: Type.Object({ + url: Type.String({ description: "The URL to fetch" }), + maxLength: Type.Number({ description: "Maximum characters", default: 20000 }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { url, maxLength = 20000 } = params as { url: string; maxLength?: number }; + try { + const content = await mcpCall("web_fetch", { url, maxLength }); + return { content }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `MCP web_fetch error: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); +} diff --git a/extensions/subagent-spawner.ts b/extensions/subagent-spawner.ts new file mode 100644 index 0000000..91d0841 --- /dev/null +++ b/extensions/subagent-spawner.ts @@ -0,0 +1,171 @@ +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; +import { spawn } from "child_process"; +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +function getRepoRoot(): string { + if (process.env.RESEARCH_PI_ROOT) { + return process.env.RESEARCH_PI_ROOT; + } + throw new Error("RESEARCH_PI_ROOT environment variable is not set. 
Cannot resolve extension/skill paths."); +} + +function resolveExt(name: string): string { + const root = getRepoRoot(); + if (fs.existsSync(name)) return name; + const candidate = path.join(root, "extensions", `${name}.ts`); + if (fs.existsSync(candidate)) return candidate; + throw new Error(`Extension not found: ${name}`); +} + +function resolveSkill(name: string): string { + const root = getRepoRoot(); + if (fs.existsSync(name)) return name; + const candidate = path.join(root, "skills", name); + if (fs.existsSync(candidate)) return candidate; + throw new Error(`Skill not found: ${name}`); +} + +function makeTempSession(): string { + const dir = path.join(os.homedir(), ".pi", "research", "sessions"); + fs.mkdirSync(dir, { recursive: true }); + return path.join(dir, `subagent-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`); +} + +function statusLog(msg: string): void { + const ts = new Date().toISOString().replace("T", " ").slice(0, 19); + process.stderr.write(`[research-status] ${ts} ${msg}\n`); +} + +export default function (pi: ExtensionAPI) { + pi.registerTool({ + name: "spawn_subagent", + description: + "Spawn a headless pi subagent to perform a task. Waits for it to finish and returns its complete output text. Subagents are read-only (no write/edit tools unless explicitly given).", + parameters: Type.Object({ + name: Type.String({ description: "Name of the subagent for logging" }), + task: Type.String({ description: "Full task prompt for the subagent" }), + tools: Type.String({ description: "Comma-separated tools, e.g. read,bash or read,grep,find,ls" }), + extensions: Type.Optional(Type.Array(Type.String(), { description: "Extension names or paths to load" })), + skills: Type.Optional(Type.Array(Type.String(), { description: "Skill names or paths to load" })), + model: Type.Optional(Type.String({ description: "Model override (provider/id or alias). Defaults to orchestrator model." 
})), + timeoutMinutes: Type.Optional(Type.Number({ default: 15, description: "Timeout in minutes" })), + }), + + async execute(callId, params, signal, onUpdate, ctx) { + const { name, task, tools, extensions = [], skills = [], model, timeoutMinutes = 15 } = params as any; + + const resolvedModel = model || (ctx.model ? `${ctx.model.provider}/${ctx.model.id}` : undefined); + if (!resolvedModel) { + return { + content: [{ type: "text", text: "Error: no model specified and orchestrator model unknown." }], + isError: true, + }; + } + + const piArgs = [ + "--mode", "json", + "--print", + "--no-extensions", + "--no-skills", + "--model", resolvedModel, + "--tools", tools, + "--session", makeTempSession(), + "--thinking", "off", + ]; + + for (const ext of extensions) { + piArgs.push("--extension", resolveExt(ext)); + } + for (const skill of skills) { + piArgs.push("--skill", resolveSkill(skill)); + } + + piArgs.push(task); + + statusLog(`Spawning subagent "${name}" (model: ${resolvedModel}, tools: ${tools})`); + + if (onUpdate) { + onUpdate({ + content: [{ type: "text", text: `Spawning subagent "${name}"...` }], + }); + } + + const startTime = Date.now(); + const timeoutMs = timeoutMinutes * 60 * 1000; + + return new Promise((resolve) => { + const proc = spawn("pi", piArgs, { + stdio: ["ignore", "pipe", "pipe"], + env: { + ...process.env, + RESEARCH_PI_ROOT: getRepoRoot(), + }, + }); + + let killed = false; + const timer = setTimeout(() => { + killed = true; + proc.kill("SIGTERM"); + }, timeoutMs); + + let buffer = ""; + const textChunks: string[] = []; + + proc.stdout!.setEncoding("utf-8"); + proc.stdout!.on("data", (chunk: string) => { + buffer += chunk; + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + for (const line of lines) { + if (!line.trim()) continue; + try { + const event = JSON.parse(line); + if (event.type === "message_update") { + const delta = event.assistantMessageEvent; + if (delta?.type === "text_delta") { + textChunks.push(delta.delta 
|| ""); + } + } + } catch {} + } + }); + + proc.stderr!.setEncoding("utf-8"); + proc.stderr!.on("data", () => {}); + + proc.on("close", (code) => { + clearTimeout(timer); + const elapsed = Math.round((Date.now() - startTime) / 1000); + const output = textChunks.join(""); + const status = code === 0 ? "done" : (killed ? "timed out" : "error"); + statusLog(`Subagent "${name}" finished (${status}) in ${elapsed}s`); + + if (killed) { + resolve({ + content: [{ type: "text", text: `Subagent "${name}" timed out after ${timeoutMinutes}m. Partial output:\n\n${output}` }], + isError: true, + }); + return; + } + + resolve({ + content: [{ type: "text", text: `[${name}] ${status} in ${elapsed}s\n\n${output}` }], + isError: code !== 0, + }); + }); + + proc.on("error", (err) => { + clearTimeout(timer); + statusLog(`Subagent "${name}" failed to spawn: ${err.message}`); + resolve({ + content: [{ type: "text", text: `Error spawning subagent "${name}": ${err.message}` }], + isError: true, + }); + }); + }); + }, + }); +} diff --git a/extensions/web-search.ts b/extensions/web-search.ts new file mode 100644 index 0000000..88b2b7a --- /dev/null +++ b/extensions/web-search.ts @@ -0,0 +1,173 @@ +/** + * Web Search & Fetch Tools for research-pi + * - web_search: Search via local SearXNG + * - web_fetch: Fetch and extract content from a URL + */ + +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; + +function getSearxngUrl(): string { + return process.env.SEARXNG_URL || "http://192.168.178.58:7777"; +} + +const WEB_TOOLS_SECTION = `\`web_search\` — Web lookup. Returns up to 20 results per query. Follow-up with web_fetch for content from promising URLs. +\`web_fetch\` — extract page text. 
Scale maxLength to content type (5-10k for quick facts, 20-50k docs, 100k+ for source/API refs).`; + +export default function webSearchExtension(pi: ExtensionAPI) { + pi.on("before_agent_start", async (event) => { + if (!event.systemPrompt.includes("web_search")) { + return { systemPrompt: event.systemPrompt + "\n" + WEB_TOOLS_SECTION }; + } + return { systemPrompt: event.systemPrompt }; + }); + + pi.registerTool({ + name: "web_search", + label: "Web Search", + description: "Search the web. Returns up to 20 results. Follow-up with web_fetch for content from promising URLs.", + promptSnippet: "Search the web. Returns up to 20 results. Follow-up with web_fetch for content from promising URLs.", + promptGuidelines: [ + "Once you have a promising result, switch to web_fetch instead of spending more searches.", + "Always web_fetch sites you plan on quoting or using information from.", + ], + parameters: Type.Object({ + query: Type.String({ description: "The search query to execute (max 2000 characters)" }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { query } = params as { query: string }; + try { + const searchUrl = new URL("/search", getSearxngUrl()); + searchUrl.searchParams.append("q", query); + searchUrl.searchParams.append("format", "json"); + + const response = await fetch(searchUrl.toString()); + if (!response.ok) { + return { + content: [{ type: "text" as const, text: `Search request failed: ${response.status} ${response.statusText}` }], + isError: true, + }; + } + + const data = (await response.json()) as { + results?: Array<{ title: string; url: string; content?: string }>; + }; + + if (!data.results || !Array.isArray(data.results)) { + return { + content: [{ type: "text" as const, text: "No results found or invalid response format from search engine." 
}], + }; + } + + const formattedResults = data.results + .map( + (result, index) => + `[${index + 1}] ${result.title}\nURL: ${result.url}\n${result.content || "No description available"}\n` + ) + .join("\n"); + + return { + content: [ + { + type: "text" as const, + text: `Found ${data.results.length} results:\n\n${formattedResults}`, + }, + ], + }; + } catch (error) { + return { + content: [ + { type: "text" as const, text: `Error executing search: ${(error as Error).message}` }, + ], + isError: true, + }; + } + }, + }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch", + description: "Fetch a URL as text. Choose maxLength based on content type.", + promptSnippet: "Fetch a URL as text.", + promptGuidelines: [ + "Set maxLength based on needs (50,000 default). Lower if a quick check, higher if precise details are important (documentation etc.)", + ], + parameters: Type.Object({ + url: Type.String({ description: "The URL to fetch" }), + maxLength: Type.Number({ + description: "Maximum characters of extracted text to return. 
Be context-aware.", + default: 20000, + }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { url, maxLength = 20000 } = params as { url: string; maxLength?: number }; + try { + const response = await fetch(url, { + headers: { + "User-Agent": "Mozilla/5.0 (compatible; PiCodingAgent/1.0)", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.7", + }, + redirect: "follow", + signal: AbortSignal.timeout(15000), + }); + + if (!response.ok) { + return { + content: [{ type: "text" as const, text: `Fetch failed: ${response.status} ${response.statusText}` }], + isError: true, + }; + } + + const contentType = response.headers.get("content-type") || ""; + const raw = await response.text(); + + let text: string; + if (contentType.includes("text/html") || contentType.includes("application/xhtml")) { + text = raw + .replace(//gi, "") + .replace(//gi, "") + .replace(//g, "") + .replace(/<(nav|header|footer)[\s\S]*?<\/\1>/gi, "") + .replace(/<\/(p|div|li|tr|h[1-6]|blockquote|pre|section|article)>/gi, "\n") + .replace(//gi, "\n") + .replace(/<[^>]+>/g, "") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, " ") + .replace(/[ \t]+/g, " ") + .replace(/\n{3,}/g, "\n") + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .join("\n") + .trim(); + } else { + text = raw.trim(); + } + + const truncated = text.length > maxLength; + const output = truncated ? text.slice(0, maxLength) + "\n\n[... truncated]" : text; + + return { + content: [ + { + type: "text" as const, + text: `Fetched ${url} (${text.length} chars${truncated ? 
`, showing first ${maxLength}` : ""}):\n\n${output}`, + }, + ], + }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `Error fetching URL: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); +} diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..cd2346d --- /dev/null +++ b/install.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +# === CONFIGURE THIS BEFORE HOSTING === +# Replace with your actual GitHub repo URL: +REPO_URL="${RESEARCH_PI_REPO:-https://github.com/YOUR_USERNAME/YOUR_REPO.git}" +# ===================================== + +INSTALL_DIR="${HOME}/.pi/research" +BIN_TARGET="${HOME}/.local/bin/research" + +echo "==> Installing research-pi..." + +# Dependencies +if ! command -v node >/dev/null 2>&1; then + echo "Error: Node.js is required but not installed." + exit 1 +fi + +if ! command -v pi >/dev/null 2>&1; then + echo "Error: pi (pi-coding-agent) is required but not installed." + exit 1 +fi + +if ! command -v pnpm >/dev/null 2>&1; then + echo "Error: pnpm is required but not installed." + exit 1 +fi + +# Clone or update +if [ -d "$INSTALL_DIR/.git" ]; then + echo "==> Updating existing installation..." + git -C "$INSTALL_DIR" pull --ff-only +else + echo "==> Cloning repository..." + mkdir -p "$(dirname "$INSTALL_DIR")" + git clone "$REPO_URL" "$INSTALL_DIR" +fi + +# Build +cd "$INSTALL_DIR" +if [ ! -d "node_modules" ]; then + echo "==> Installing dependencies..." + pnpm install +fi + +echo "==> Building..." +pnpm build + +# Symlink +mkdir -p "$(dirname "$BIN_TARGET")" +if [ -L "$BIN_TARGET" ] || [ -e "$BIN_TARGET" ]; then + rm -f "$BIN_TARGET" +fi +ln -s "$INSTALL_DIR/bin/research" "$BIN_TARGET" + +# Default config +CONFIG_DIR="${HOME}/.pi/research" +CONFIG_FILE="${CONFIG_DIR}/config.json" +if [ ! -f "$CONFIG_FILE" ]; then + echo "==> Creating default config..." + cp "$INSTALL_DIR/config/default.json" "$CONFIG_FILE" +fi + +echo "==> Installation complete!" 
+echo " Binary: $BIN_TARGET" +echo " Source: $INSTALL_DIR" +echo "" +echo "Usage example:" +echo ' research --model k2p5 --start_research --task "native android app using gemma 4 e4b"' diff --git a/package.json b/package.json new file mode 100644 index 0000000..c3582eb --- /dev/null +++ b/package.json @@ -0,0 +1,17 @@ +{ + "name": "research-pi", + "version": "0.1.0", + "description": "Headless research orchestrator for pi-coding-agent", + "bin": { + "research": "./bin/research" + }, + "scripts": { + "build": "tsc", + "dev": "tsc --watch" + }, + "dependencies": {}, + "devDependencies": { + "@types/node": "^20.0.0", + "typescript": "^5.0.0" + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml new file mode 100644 index 0000000..0114a7f --- /dev/null +++ b/pnpm-lock.yaml @@ -0,0 +1,39 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + '@types/node': + specifier: ^20.0.0 + version: 20.19.39 + typescript: + specifier: ^5.0.0 + version: 5.9.3 + +packages: + + '@types/node@20.19.39': + resolution: {integrity: sha512-orrrD74MBUyK8jOAD/r0+lfa1I2MO6I+vAkmAWzMYbCcgrN4lCrmK52gRFQq/JRxfYPfonkr4b0jcY7Olqdqbw==} + + typescript@5.9.3: + resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@6.21.0: + resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + +snapshots: + + '@types/node@20.19.39': + dependencies: + undici-types: 6.21.0 + + typescript@5.9.3: {} + + undici-types@6.21.0: {} diff --git a/research/paper-summary.md b/research/paper-summary.md new file mode 100644 index 0000000..e341af6 --- /dev/null +++ b/research/paper-summary.md @@ -0,0 +1,274 @@ +# Research Paper Summary: Testing Methodologies + +A synthesis of recent academic research, technical reports, and scientific approaches to testing 
across software engineering, statistics, and emerging domains. + +--- + +## 1. Key Research Domains + +### Primary Areas of Academic Focus + +| Domain | Description | Key Venues | +|--------|-------------|------------| +| **Software Testing** | Test generation, prioritization, regression | ICSE, ASE, FSE | +| **Statistical Testing** | Hypothesis testing, e-values, p-values | stat.ME, math.ST | +| **Fuzzing** | Automated vulnerability discovery | ACM CCS, S&P, USENIX | +| **ML/AI Testing** | Deep learning model validation | ML conferences, arXiv | +| **Quantum Testing** | Quantum program verification | QCE, arXiv quant-ph | +| **CPS Testing** | Cyber-physical systems security | Embedded systems venues | + +--- + +## 2. Notable Papers and Findings + +### A. Software Testing & Test Generation + +#### LLM-Augmented Testing + +**"Enhancing Large Language Models with Retrieval Augmented Generation for Software Testing" (2026)** +- **Authors:** Fingleton, Siavash, Moin +- **Key Finding:** RAG pipelines reduce LLM hallucination and improve test generation effectiveness +- **Link:** [arXiv:2604.15270](https://arxiv.org/abs/2604.15270) + +**"E-Test: E'er-Improving Test Suites" (2025)** +- **Authors:** Qiu, Di Grazia, Mariani, Pezzè +- **Key Finding:** LLM-augmented test suites achieve F1-score of 0.55 vs 0.34 for traditional regression testing +- **Method:** Augments tests using production execution scenarios not covered by existing tests +- **Link:** [arXiv:2510.19860](https://arxiv.org/abs/2510.19860) + +**"Inline Tests" (ASE 2022)** +- **Authors:** Liu, Nie, Legunsen, Gligoric +- **Innovation:** I-Test framework for testing individual statements +- **Performance:** Negligible overhead (0.007x–0.014x) +- **Impact:** Found 2 faults in production open-source projects +- **Link:** [arXiv:2209.06315](https://arxiv.org/abs/2209.06315) + +#### Mutation Testing Research + +**"Does mutation testing improve testing practices?" 
(ICSE 2021)** +- **Authors:** Petrović, Ivanković, Fraser, Just +- **Scope:** Analysis of 15 million mutants +- **Key Finding:** Mutants are coupled with real faults; developers write more tests when using mutation testing +- **Significance:** Validates mutation testing as a quality metric beyond coverage +- **Link:** [arXiv:2103.07189](https://arxiv.org/abs/2103.07189) + +### B. Statistical & Hypothesis Testing + +**"Continuous Testing: Unifying Tests and E-values" (2024)** +- **Author:** Nick W. Koning +- **Innovation:** Unifies e-values and classical testing into single continuous framework +- **Key Finding:** E-values provide stronger evidence guarantees than p-values +- **Significance:** Foundation for sequential/adaptive testing methods +- **Link:** [arXiv:2409.05654](https://arxiv.org/abs/2409.05654) + +**"The Test of Tests: A Framework For Differentially Private Hypothesis Testing" (2023)** +- **Authors:** Kazan, Shi, Groce, Bray +- **Innovation:** Black-box framework for differentially private hypothesis tests +- **Performance:** Requires only 5-6x more data than public setting at ε=1 +- **Link:** [arXiv:2302.04260](https://arxiv.org/abs/2302.04260) + +### C. Fuzzing & Security Testing + +**"Prompt Fuzzing for Fuzz Driver Generation" (ACM CCS 2024)** +- **Authors:** Lyu, Xie, Chen, Chen +- **Innovation:** Coverage-guided fuzzing using LLMs for prompt fuzzing +- **Performance:** 1.61-1.63x higher branch coverage than OSS-Fuzz/Hopper +- **Impact:** Found 33 new bugs in real-world software +- **Link:** [arXiv:2312.17677](https://arxiv.org/abs/2312.17677) + +**"Large-Scale Empirical Analysis of Continuous Fuzzing" (2025)** +- **Authors:** Shirai et al. 
+- **Scope:** Analysis of ~1.12 million fuzzing sessions from 878 OSS-Fuzz projects +- **Key Findings:** + - High detection rates in early stages + - Coverage continues increasing over time (not saturating quickly) +- **Link:** [arXiv:2510.16433](https://arxiv.org/abs/2510.16433) + +**"Deep Reinforcement Fuzzing" (2018)** +- **Innovation:** Deep RL applied to fuzzing +- **Impact:** Found 20+ bugs in real-world software +- **Link:** [arXiv:1801.04589](https://arxiv.org/abs/1801.04589) + +### D. Metamorphic Testing + +**"Evaluating Human Trajectory Prediction with Metamorphic Testing" (2024)** +- **Authors:** Spieker, Belmecheri, Gotlieb, Lazaar +- **Innovation:** Wasserstein Violation Criterion for assessing metamorphic relations in stochastic systems +- **Application:** Oracle-less testing for ML predictions +- **Link:** [arXiv:2407.18756](https://arxiv.org/abs/2407.18756) + +**"METAL: Metamorphic Testing Framework for Analyzing Large-Language Model Qualities" (2023)** +- **Authors:** Hyun, Guo, Babar +- **Innovation:** Generates hundreds of metamorphic relations from templates +- **Novel Metric:** Integrates Attack Success Rate (ASR) with semantic quality +- **Link:** [arXiv:2312.06056](https://arxiv.org/abs/2312.06056) + +### E. Machine Learning System Testing + +**"Testing Deep Learning Models: A First Comparative Study" (2022)** +- **Authors:** Ahuja, Gotlieb, Spieker +- **Scope:** Comparative evaluation of differential, metamorphic, mutation, combinatorial, and adversarial testing +- **Target:** Vision-based systems +- **Link:** [arXiv:2202.12139](https://arxiv.org/abs/2202.12139) + +**"DeepMutation: Mutation Testing of Deep Learning Systems" (ISSRE 2018)** +- **Authors:** Ma et al. +- **Innovation:** Source-level and model-level mutation operators for DL systems +- **Purpose:** Evaluating test data quality for neural networks +- **Link:** [arXiv:1805.05206](https://arxiv.org/abs/1805.05206) + +### F. 
Quantum Software Testing + +**"Testing Multi-Subroutine Quantum Programs: From Unit Testing to Integration Testing" (2023)** +- **Authors:** Long, Zhao +- **Significance:** First comprehensive framework for quantum program testing +- **Components:** IO analysis, quantum relation checking, structural/behavior testing +- **Link:** [arXiv:2306.17407](https://arxiv.org/abs/2306.17407) + +### G. CI/CD & Regression Testing + +**"Formalizing Regression Testing for Agile and Continuous Integration Environments" (2025)** +- **Authors:** Das, Gary +- **Innovation:** First formalization using build-tuples and regression test windows +- **Application:** Continuous regression testing in agile environments +- **Link:** [arXiv:2511.02810](https://arxiv.org/abs/2511.02810) + +### H. Industrial Applications + +**"LLM-Based Automated Diagnosis Of Integration Test Failures At Google" (2026)** +- **Authors:** Ziftci, Liu, Greene, Dalloro +- **Tool:** Auto-Diagnose +- **Performance:** 90.14% accuracy in root cause diagnosis +- **Usage:** Deployed Google-wide, only 5.8% "not helpful" ratings +- **Impact:** Significant reduction in debugging time for integration failures +- **Link:** [arXiv:2604.12108](https://arxiv.org/abs/2604.12108) + +**"AnyPoC: Universal Proof-of-Concept Test Generation" (2026)** +- **Authors:** Zhao, Yang, et al. +- **Innovation:** Multi-agent framework for executable PoC generation +- **Performance:** 1.3x more valid PoCs than Claude Code +- **Impact:** Discovered 122 new bugs (105 confirmed, 86 fixed) +- **Link:** [arXiv:2604.11950](https://arxiv.org/abs/2604.11950) + +--- + +## 3. 
Research Trends + +### Current Trends (2024-2025) + +| Trend | Description | Key Papers | +|-------|-------------|------------| +| **LLM-Augmented Testing** | RAG pipelines, automated test generation, failure diagnosis | E-Test, Google Auto-Diagnose | +| **Continuous/Adaptive Testing** | Formal models for agile regression testing | Das & Gary (2025) | +| **Deep Learning Testing** | Mutation testing for neural networks, adversarial testing | DeepMutation, Ahuja et al. | +| **Fuzzing Evolution** | Prompt fuzzing, RL-based fuzzing, directed fuzzing | PromptFuzz, Deep RL Fuzzing | +| **Quantum Software Testing** | First frameworks emerging for quantum programs | Long & Zhao (2023) | +| **Metamorphic Testing** | Expansion to LLM quality testing, stochastic systems | METAL, Spieker et al. | +| **E-values in Testing** | Alternative to p-values with stronger guarantees | Koning (2024) | + +### Emerging Techniques + +| Technique | Description | Source | +|-----------|-------------|--------| +| **Property-Based Mutation Testing** | Combines mutation testing with formal property validation | Recent workshop papers | +| **Inline Testing** | Statement-level testing with negligible overhead | Liu et al. (ASE 2022) | +| **Behavioral Diversity** | Using mutation to measure test suite behavior diversity | Follow-up to Petrović et al. | +| **Test Smells Analysis** | Flaky test prediction using test smells | Follow-up research | + +--- + +## 4. 
Algorithmic and Performance Insights + +### Key Algorithmic Contributions + +| Algorithm/Framework | Contribution | Performance | +|--------------------|--------------|-------------| +| **E-value Framework** | Generalizes tests to continuous domain | Stronger evidence guarantees than p-values | +| **LLVM-based Mutation (Mull)** | Language-independent mutation via IR manipulation | Faster via JIT compilation | +| **Coverage-Guided Prompt Fuzzing** | Iterative LLM-based fuzz driver generation | 1.61-1.63x higher branch coverage | +| **Active Fuzzing** | Online active learning for CPS network attacks | Adaptive test generation | +| **Token-Level Fuzzing** | Mutations at token level | Finds bugs byte/grammar fuzzing miss | + +### Performance Metrics + +| Metric | Value | Context | +|--------|-------|---------| +| E-Test F1-score | 0.55 | vs 0.34 regression, 0.39 vanilla LLM | +| Inline Testing Overhead | 0.007x–0.014x | Negligible runtime impact | +| Auto-Diagnose Accuracy | 90.14% | Google integration test failures | +| PromptFuzz Coverage | 1.61-1.63x | vs OSS-Fuzz and Hopper | +| Mutation vs Coverage | 96.01% vs 55.68% | Defect detection rate | +| Mull Processing | Significant speedup | Via IR-level manipulation | + +### Benchmarks Referenced + +| Benchmark | Description | Papers Using | +|-----------|-------------|--------------| +| **OSS-Fuzz** | Google's continuous fuzzing service (~1.12M sessions, 878 projects) | Shirai et al. (2025) | +| **Defects4J** | Widely-used bug benchmark for Java | Multiple validation studies | +| **CVEFixes** | Vulnerability fixing dataset | Security testing research | +| **SIR-Bench** | Security incident response (794 test cases) | CPS testing | + +--- + +## 5. Key Research Insights + +### Validated Findings + +1. **Mutation Testing Value** - Petrović et al.'s 15-million-mutant study confirms mutants correlate with real faults and drive better testing practices + +2. 
**LLM Effectiveness** - LLM-augmented test generation (E-Test) significantly outperforms vanilla LLM prompting (F1: 0.55 vs 0.39), with RAG pipelines further reducing hallucination (Fingleton et al.)
+
+3. **Industrial Success** - Google's Auto-Diagnose demonstrates 90%+ accuracy for test failure diagnosis at scale
+
+4. **Fuzzing Effectiveness** - Coverage-guided approaches (especially LLM-augmented) consistently outperform random fuzzing
+
+5. **E-value Superiority** - E-values provide stronger statistical guarantees than p-values for sequential testing
+
+### Research Gaps Identified
+
+- Limited work on quantum software testing (emerging field)
+- Property-based mutation testing still underexplored
+- Unified frameworks for multi-paradigm testing lacking
+- Tool integration with modern CI/CD workflows needs improvement
+
+---
+
+## 6. Implications for Practice
+
+### Evidence-Based Recommendations
+
+| Practice | Evidence | Source |
+|----------|----------|--------|
+| Adopt mutation testing | 96% defect detection vs 55% coverage | Petrović et al. |
+| Use LLM-augmented test generation | ~40% F1 improvement over vanilla LLM (0.55 vs 0.39) | Qiu et al. (E-Test) |
+| Implement continuous fuzzing | High early detection, sustained coverage growth | Shirai et al. |
+| Consider e-values for sequential testing | Stronger guarantees than p-values | Koning |
+| Explore metamorphic testing for ML | Effective for oracle-less scenarios | Multiple papers |
+
+### Emerging Practical Tools
+
+| Tool/Approach | Status | Source |
+|---------------|--------|--------|
+| Auto-Diagnose (Google) | Production deployed | Google research |
+| AnyPoC | Research prototype | Zhao et al. |
+| Prompt Fuzzing | Academic prototype | Lyu et al. |
+| Inline Testing (I-Test) | Research prototype | Liu et al. |
+
+---
+
+## 7. Conclusion
+
+The research landscape on testing is experiencing rapid evolution, particularly with the integration of **large language models** into testing workflows and the maturation of **mutation testing** as a quality metric. 
Key developments include: + +- **AI-augmented testing** showing production-ready results (90% accuracy at Google) +- **E-values** emerging as a statistical foundation for continuous testing +- **Quantum testing** representing a new frontier +- **Metamorphic testing** expanding beyond traditional applications + +The overarching trend is toward **intelligent, continuous, and adaptive** testing systems that leverage both rigorous statistical foundations and modern AI capabilities. + +--- + +*Research compiled: April 2026* +*Sources: arXiv, ACM Digital Library, IEEE Xplore, conference proceedings* \ No newline at end of file diff --git a/research/web-summary.md b/research/web-summary.md new file mode 100644 index 0000000..5a032f0 --- /dev/null +++ b/research/web-summary.md @@ -0,0 +1,242 @@ +# Web Research Summary: Testing Methodologies + +A comprehensive summary of the current testing landscape, frameworks, tools, and emerging trends based on web research conducted April 2026. + +--- + +## 1. 
Main Domains Where "Test" is Relevant + +### Software Testing (Primary Focus) +The dominant context for "test" is software quality assurance, encompassing: + +| Category | Description | +|----------|-------------| +| **Unit Testing** | Testing individual components in isolation | +| **Integration Testing** | Testing interactions between components | +| **End-to-End (E2E)** | Full application workflow testing | +| **Performance/Load Testing** | System behavior under load | +| **Contract Testing** | API contract validation between services | +| **Property-Based Testing** | Testing with generated inputs | +| **Mutation Testing** | Evaluating test quality via code mutation | +| **Visual/Regression Testing** | UI appearance validation | +| **Security Testing** | Vulnerability and penetration testing | +| **Chaos Engineering** | System resilience through induced failures | + +### Other Domains +- **Medical Testing** - Diagnostic health tests +- **A/B Testing** - Production experimentation frameworks +- **Infrastructure Testing** - Testing Infrastructure-as-Code + +--- + +## 2. Key Frameworks and Tools + +### A. Unit Testing Frameworks + +| Tool | Stars | Best For | +|------|-------|----------| +| [Jest](https://github.com/jestjs/jest) | 45,337 | JavaScript/TypeScript, snapshot testing | +| [Vitest](https://github.com/vitest-dev/vitest) | 16,375 | Vite projects, native ESM | +| [Mocha](https://github.com/mochajs/mocha) | 22,882 | Flexible Node.js testing | +| [pytest](https://github.com/pytest-dev/pytest) | 13,776 | Python ecosystem | +| **JUnit** | Industry standard | Java applications | + +### B. 
End-to-End (Browser) Testing + +| Tool | Stars | Key Strengths | +|------|-------|---------------| +| [Playwright](https://github.com/microsoft/playwright) | 86,678 | Cross-browser (Chromium, Firefox, WebKit), auto-waiting, trace viewer | +| [Cypress](https://github.com/cypress-io/cypress) | 49,626 | Fast execution, great DX, time-travel debugging | +| [Selenium](https://github.com/SeleniumHQ/selenium) | 34,083 | Mature, multi-language, WebDriver standard | +| [WebdriverIO](https://github.com/webdriverio/webdriverio) | 9,793 | Next-gen browser/mobile automation | +| [Nightwatch.js](https://github.com/nightwatchjs/nightwatch) | 11,942 | W3C WebDriver API compliance | + +**Trend Alert:** Playwright (86K+ stars) is overtaking Cypress (49K+ stars) due to superior cross-browser support and reliability. + +### C. API Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Hoppscotch](https://github.com/hoppscotch/hoppscotch) | 78,953 | Open-source Postman alternative | +| [Bruno](https://github.com/usebruno/bruno) | 43,023 | Git-friendly API testing IDE | +| [Supertest](https://github.com/ladjs/supertest) | 14,346 | HTTP assertions for Node.js | +| **REST Assured** | Popular | Java API testing | + +### D. Load/Performance Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [k6](https://github.com/grafana/k6) | 30,379 | Modern Go-based load testing with JS scripting | +| [Locust](https://github.com/locustio/locust) | 27,720 | Python-based, highly scalable | +| [JMeter](https://github.com/apache/jmeter) | 9,348 | Apache's mature load testing tool | +| [Vegeta](https://github.com/tsenart/vegeta) | 25,004 | HTTP load testing CLI tool | + +### E. 
Component/UI Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Storybook](https://github.com/storybookjs/storybook) | 89,727 | Component development and testing isolation | +| [Testing Library](https://github.com/testing-library/react-testing-library) | 19,572 | Behavior-focused testing utilities | +| **Enzyme** | Legacy | Being replaced by Testing Library | + +### F. Contract Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Pact](https://github.com/pact-foundation/pact-ruby) | 2,193 | Consumer-driven contract testing standard | +| [Pact JS](https://github.com/pact-foundation/pact-js) | 1,757 | JavaScript implementation | +| [Pact JVM](https://github.com/pact-foundation/pact-jvm) | 1,127 | JVM/Kotlin implementation | + +### G. Advanced Testing Techniques + +| Tool | Purpose | +|------|---------| +| [Stryker JS](https://github.com/stryker-mutator/stryker-js) | JavaScript mutation testing | +| [Infection](https://github.com/infection/infection) | PHP mutation testing | +| [Hypothesis](https://github.com/HypothesisWorks/hypothesis) | Python property-based testing | +| [fast-check](https://github.com/dubzzz/fast-check) | JavaScript property-based testing | + +### H. Infrastructure Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Terratest](https://github.com/gruntwork-io/terratest) | 7,894 | Go library for testing Terraform | +| [terraform-compliance](https://github.com/eerkunt/terraform-compliance) | 1,447 | BDD-style security testing for Terraform | + +### I. Security Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [MobSF](https://github.com/MobSF/Mobile-Security-Framework-MobSF) | 20,812 | Mobile security testing framework | +| [OWASP MASTG](https://github.com/OWASP/owasp-mastg) | 12,830 | Mobile security testing guide | +| [OWASP WSTG](https://github.com/OWASP/wstg) | 9,091 | Web security testing guide | + +--- + +## 3. 
Recent Trends (2024-2025) + +### Major Shifts + +1. **Playwright Dominance** - Crossing 86K stars, replacing Selenium and challenging Cypress + - Better cross-browser support (WebKit, Firefox, Chromium) + - Built-in trace viewer and code generation + - Superior reliability with auto-waiting + +2. **Vitest Rising** - 16K+ stars, becoming the default for Vite projects + - Native ESM support + - Jest-compatible API + - Significantly faster execution + +3. **AI-Assisted Testing** - Rapid emergence of AI-powered tools + - [Browser-use](https://github.com/browser-use/browser-use) (88K+ stars) - AI browser automation + - Automated test generation and maintenance + +4. **Git-Native API Testing** - Bruno's 43K+ stars signal demand for version-controlled collections + - Alternative to proprietary formats (Postman) + - Better CI/CD integration + +5. **Component Testing Maturity** - Storybook with built-in testing + - Visual regression via Chromatic + - Interaction testing in isolation + +6. **Mutation Testing Adoption** - Stryker and Infection gaining traction + - Focus on test quality, not just coverage + - CI integration for quality gates + +7. **Shift-Left Security** - Earlier integration of security testing + - OWASP tools in CI pipelines + - Security-as-code practices + +--- + +## 4. 
Important Resources + +### Best Practice Guides + +| Resource | Link | Description | +|----------|------|-------------| +| JavaScript Testing Best Practices | [github.com/goldbergyoni/javascript-testing-best-practices](https://github.com/goldbergyoni/javascript-testing-best-practices) | 24,602 ⭐ comprehensive guide | +| Node.js Best Practices | [github.com/goldbergyoni/nodebestpractices](https://github.com/goldbergyoni/nodebestpractices) | 105,208 ⭐ includes testing section | + +### Official Documentation + +| Framework | Documentation | +|-----------|---------------| +| Jest | [jestjs.io](https://jestjs.io) | +| Vitest | [vitest.dev](https://vitest.dev) | +| Playwright | [playwright.dev](https://playwright.dev) | +| Cypress | [docs.cypress.io](https://docs.cypress.io) | +| Storybook | [storybook.js.org](https://storybook.js.org) | +| Pact | [pact.io](https://pact.io) | +| k6 | [k6.io](https://k6.io) | + +--- + +## 5. Emerging Best Practices + +### Testing Strategy (2024-2025) + +The modern testing pyramid has evolved: + +``` + ╱╲ + ╱ ╲ Visual regression tests + ╱────╲ + ╱ ╲ E2E tests (critical paths only) + ╱────────╲ + ╱ ╲ Integration tests +╱────────────╲ + Unit tests + Static analysis +``` + +### Key Principles + +1. **Testing Library Philosophy** + - Test behavior, not implementation + - Query elements as users would (getByRole, getByText) + - Avoid testing component internals + +2. **Mocking Best Practices** + - Prefer MSW (Mock Service Worker) for API mocking + - Limit mocking to boundaries (network, filesystem) + - Use real implementations where possible + +3. **Test Organization** + - Colocate tests with source or use `__tests__` directories + - Descriptive names: "should display user name after login" + - Group related tests with describe blocks + +4. **CI/CD Integration** + - Fast tests (unit) on every commit + - Slower tests (E2E) on PR/push to main + - Parallel execution + - Coverage tracking with regression detection + +5. 
**Modern TypeScript/JavaScript Stack (2025)** + | Purpose | Tool | + |---------|------| + | Framework | Vitest (Vite) or Jest | + | E2E | Playwright | + | Component | Storybook + Testing Library | + | API | Supertest or MSW | + | Mocking | Built-in or MSW | + +--- + +## 6. Key Takeaways + +The testing landscape is rapidly evolving toward: + +- **Faster, more reliable tools** - Playwright over Selenium, Vitest over traditional runners +- **Developer-friendly experiences** - Better DX, watch modes, clearer error messages +- **AI integration** - Emerging tools for test generation and maintenance +- **Quality over coverage** - Mutation testing gaining adoption +- **Shift-left practices** - Earlier testing, security in CI/CD +- **Open-source alternatives** - Bruno/Hoppscotch vs proprietary tools + +The dominant trend is **seamless integration** of testing into modern development workflows with minimal friction and maximum feedback value. + +--- + +*Research conducted: April 2026* +*Sources: Open-source repositories, official documentation, community discussions* \ No newline at end of file diff --git a/skills/web-search-bash/SKILL.md b/skills/web-search-bash/SKILL.md new file mode 100644 index 0000000..94a233d --- /dev/null +++ b/skills/web-search-bash/SKILL.md @@ -0,0 +1,61 @@ +--- +name: web-search-bash +description: Web search and fetch using curl/wget against a local SearXNG server. +--- + +Web search and content fetching via local SearXNG server. + +## Server URL + +The SearXNG instance URL is available in the `SEARXNG_URL` environment variable. If unset, fall back to `http://192.168.178.58:7777`. + +```bash +SEARXNG_URL="${SEARXNG_URL:-http://192.168.178.58:7777}" +``` + +## Endpoints + +### Search +- **URL**: `$SEARXNG_URL/search` +- **Parameters**: `q` (query), `format=json` (structured results) + +### Fetch +- Direct HTTP GET to any URL with custom headers. 
+
+---
+
+## Search Examples
+
+```bash
+# Basic search
+SEARXNG_URL="${SEARXNG_URL:-http://192.168.178.58:7777}"
+curl -s "$SEARXNG_URL/search?q=your+query&format=json"
+
+# Extract titles and URLs
+curl -s "$SEARXNG_URL/search?q=your+query&format=json" | \
+  jq -r '.results[] | "[\(.title)] \(.url)"'
+```
+
+## Fetch Examples
+
+```bash
+# Fetch with timeout and user-agent
+curl -s --max-time 15 \
+  -A "Mozilla/5.0 (compatible; PiCodingAgent/1.0)" \
+  -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.7" \
+  "https://example.com"
+
+# Strip HTML to plain text
+curl -s -A "Mozilla/5.0" "https://example.com" | \
+  sed -e 's/<[^>]*>//g' | \
+  tr -s '[:space:]' ' ' | \
+  sed 's/^ *//;s/ *$//' | \
+  head -c 20000
+```
+
+## Tips
+
+- Search returns up to ~20 results.
+- Use `jq` to parse JSON if available.
+- For large pages, limit output with `head -c 20000`.
+- Always include source URLs in summaries.
diff --git a/src/cli.ts b/src/cli.ts
new file mode 100644
index 0000000..27effe8
--- /dev/null
+++ b/src/cli.ts
@@ -0,0 +1,103 @@
+export interface Args {
+  model?: string;
+  mode?: "start_research" | "onboarding" | "new_feature";
+  task?: string;
+  webSearchMode: "extension" | "mcp" | "skill";
+  mcpUrl?: string;
+  outputDir: string;
+  timeout: number;
+  verbose: boolean;
+  help: boolean;
+}
+
+export function showHelp(): void {
+  console.log(`research - Headless research orchestrator for pi-coding-agent
+
+Usage:
+  research --model --start_research --task ""
+  research --model --onboarding [--task ""]
+  research --model --new_feature --task ""
+
+Options:
+  --model Model alias from ~/.pi/agent/models.json (required)
+  --start_research Research a new topic from scratch
+  --onboarding Map and document an existing codebase
+  --new_feature Plan how to add a feature to the current project
+  --task "" Task description / scope (required for start_research and new_feature)
+  --web-search-mode Web search backend: extension (default), mcp, or skill
+  --mcp-url 
MCP server URL (required when --web-search-mode=mcp) + --output-dir Where to write deliverables (default: cwd) + --timeout Per-agent timeout (default: 15) + --verbose Stream full orchestrator output to stderr + --help Show this help message + +Examples: + research --model k2p5 --start_research --task "native android app using gemma 4 e4b" + research --model kimi-for-coding --onboarding + research --model minimax-token-plan/MiniMax-M2.7 --new_feature --task "add comment section to react blog" +`); +} + +export function parseArgs(argv: string[]): Args { + const args: Args = { + webSearchMode: "extension", + outputDir: process.cwd(), + timeout: 15, + verbose: false, + help: false, + }; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + switch (a) { + case "--model": + args.model = argv[++i]; + break; + case "--start_research": + args.mode = "start_research"; + break; + case "--onboarding": + args.mode = "onboarding"; + break; + case "--new_feature": + args.mode = "new_feature"; + break; + case "--task": + args.task = argv[++i]; + break; + case "--web-search-mode": + args.webSearchMode = argv[++i] as any; + break; + case "--mcp-url": + args.mcpUrl = argv[++i]; + break; + case "--output-dir": + args.outputDir = argv[++i]; + break; + case "--timeout": + args.timeout = parseInt(argv[++i], 10); + break; + case "--verbose": + args.verbose = true; + break; + case "--help": + case "-h": + args.help = true; + break; + } + } + + return args; +} + +export function validateArgs(args: Args): void { + if (!args.model) { + throw new Error("--model is required"); + } + if (!args.mode) { + throw new Error("One of --start_research, --onboarding, or --new_feature is required"); + } + if (args.webSearchMode === "mcp" && !args.mcpUrl) { + throw new Error("--mcp-url is required when --web-search-mode=mcp"); + } +} diff --git a/src/launcher.ts b/src/launcher.ts new file mode 100644 index 0000000..4e97bd1 --- /dev/null +++ b/src/launcher.ts @@ -0,0 +1,154 @@ +import { spawn } 
from "child_process";
import * as fs from "fs";
import * as path from "path";
import { Args } from "./cli.js";
import { resolveModel } from "./models.js";
import { ensureDir, loadConfig, makeTempSession } from "./utils.js";

/** Write a timestamped status line to stderr (always shown, even non-verbose). */
function statusLog(msg: string): void {
  const ts = new Date().toISOString().replace("T", " ").slice(0, 19);
  process.stderr.write(`[research-status] ${ts} ${msg}\n`);
}

/**
 * Spawn the `pi` CLI as a headless research orchestrator.
 *
 * Builds the extension/skill set from args.webSearchMode, assembles the
 * orchestrator prompt, streams pi's JSON event output (filtered to stderr),
 * and exits this process with pi's exit code when it finishes.
 *
 * Side effects: creates the output directory, spawns a child process,
 * terminates the current process via process.exit.
 */
export function launch(args: Args): void {
  const config = loadConfig();
  const resolvedModel = resolveModel(args.model!);

  // Repo root is where this script is installed (e.g. ~/.pi/research).
  // Launcher runs from dist/src/launcher.js, so go up two levels.
  const repoRoot = path.resolve(path.dirname(process.argv[1]), "..");

  const subagentSpawnerExt = path.join(repoRoot, "extensions", "subagent-spawner.ts");
  const webSearchExt = path.join(repoRoot, "extensions", "web-search.ts");
  const mcpWebSearchExt = path.join(repoRoot, "extensions", "mcp-web-search.ts");
  const webSearchSkill = path.join(repoRoot, "skills", "web-search-bash");
  const orchestratorPrompt = path.join(repoRoot, "agents", "orchestrator.md");

  const extensions: string[] = [subagentSpawnerExt];
  const skills: string[] = [];

  if (args.webSearchMode === "extension") {
    extensions.push(webSearchExt);
  } else if (args.webSearchMode === "mcp") {
    extensions.push(mcpWebSearchExt);
  } else {
    // Any other mode falls back to the bash-based web-search skill.
    skills.push(webSearchSkill);
  }

  // Fix: create the deliverables directory up front. ensureDir was imported
  // but never used, and the orchestrator writes into this directory.
  const outputDir = path.resolve(args.outputDir);
  ensureDir(outputDir);

  const prompt = [
    `MODE: ${args.mode}`,
    args.task ? `TASK: ${args.task}` : "",
    `OUTPUT_DIR: ${outputDir}`,
    `TIMEOUT_MINUTES: ${args.timeout}`,
    `WEB_SEARCH_MODE: ${args.webSearchMode}`,
    args.mcpUrl ? `MCP_URL: ${args.mcpUrl}` : "",
    `CONFIG: ${JSON.stringify(config)}`,
    "Begin.",
  ]
    .filter(Boolean)
    .join("\n");

  const piArgs = [
    "--mode", "json",
    "--print",
    "--no-extensions",
    "--no-skills",
    "--model", resolvedModel,
    "--tools", "read,write,bash,grep,find,ls",
    "--session", makeTempSession(),
    "--thinking", "off",
  ];

  for (const ext of extensions) {
    piArgs.push("--extension", ext);
  }
  for (const skill of skills) {
    piArgs.push("--skill", skill);
  }

  piArgs.push("--append-system-prompt", orchestratorPrompt);
  piArgs.push(prompt);

  statusLog("Launching research orchestrator...");
  if (args.verbose) {
    console.error("[research] Spawning pi with args:", piArgs.join(" "));
  }

  const proc = spawn("pi", piArgs, {
    stdio: ["ignore", "pipe", "pipe"],
    env: {
      ...process.env,
      RESEARCH_PI_ROOT: repoRoot,
      MCP_URL: args.mcpUrl || "",
    },
  });

  // Handle one newline-delimited JSON event from pi's stdout.
  const handleEvent = (line: string): void => {
    if (!line.trim()) return;
    try {
      const event = JSON.parse(line);
      if (event.type === "message_update") {
        const delta = event.assistantMessageEvent;
        if (delta?.type === "text_delta" && args.verbose) {
          process.stderr.write(delta.delta);
        }
      } else if (event.type === "tool_execution_start" && args.verbose) {
        const name = event.toolCall?.name || "tool";
        process.stderr.write(`\n[tool:${name}]\n`);
      }
    } catch {
      // Not JSON — pass the raw line through only in verbose mode.
      if (args.verbose) {
        process.stderr.write(line + "\n");
      }
    }
  };

  // Status lines are always forwarded; everything else only when verbose.
  // (Extracted helper: this logic was triplicated in the original.)
  const emitStderrLine = (line: string): void => {
    if (line.startsWith("[research-status]") || args.verbose) {
      process.stderr.write(line + "\n");
    }
  };

  let stdoutBuffer = "";
  proc.stdout!.setEncoding("utf-8");
  proc.stdout!.on("data", (chunk: string) => {
    stdoutBuffer += chunk;
    const lines = stdoutBuffer.split("\n");
    stdoutBuffer = lines.pop() || "";
    for (const line of lines) {
      handleEvent(line);
    }
  });

  let stderrBuffer = "";
  proc.stderr!.setEncoding("utf-8");
  proc.stderr!.on("data", (chunk: string) => {
    stderrBuffer += chunk;
    const lines = stderrBuffer.split("\n");
    stderrBuffer = lines.pop() || "";
    for (const line of lines) {
      emitStderrLine(line);
    }
  });

  proc.on("close", (code) => {
    // Flush trailing partial lines. Fix: the original flushed only the
    // stderr buffer, silently dropping a final stdout event that arrived
    // without a terminating newline.
    if (stdoutBuffer.trim()) {
      handleEvent(stdoutBuffer);
    }
    if (stderrBuffer.trim()) {
      emitStderrLine(stderrBuffer.trim());
    }
    statusLog(`Orchestrator finished (exit code ${code ?? 0})`);
    process.exit(code ?? 0);
  });

  proc.on("error", (err) => {
    console.error("[research] Failed to spawn pi:", err.message);
    process.exit(1);
  });
}
diff --git a/src/main.ts b/src/main.ts new file mode 100644 index 0000000..7cc52b2 --- /dev/null +++ b/src/main.ts @@ -0,0 +1,18 @@
#!/usr/bin/env node
import { parseArgs, validateArgs, showHelp } from "./cli.js";
import { launch } from "./launcher.js";

const args = parseArgs(process.argv.slice(2));

if (args.help) {
  showHelp();
  process.exit(0);
}

try {
  validateArgs(args);
  launch(args);
} catch (err) {
  // Fix: narrow `unknown` instead of `catch (err: any)`.
  const msg = err instanceof Error ? err.message : String(err);
  console.error("Error:", msg);
  process.exit(1);
}
diff --git a/src/models.ts b/src/models.ts new file mode 100644 index 0000000..fb6002c --- /dev/null +++ b/src/models.ts @@ -0,0 +1,95 @@
import * as fs from "fs";
import * as path from "path";
import * as os from "os";

/** One model entry inside a provider block of ~/.pi/agent/models.json. */
export interface ModelEntry {
  id: string;
  name?: string;
  contextWindow?: number;
  reasoning?: boolean;
}

/** One provider block of ~/.pi/agent/models.json. */
export interface Provider {
  baseUrl: string;
  api: string;
  apiKey: string;
  models: ModelEntry[];
}

export interface ModelsJson {
  // Keyed by provider name. NOTE(review): the type arguments were lost in
  // extraction (`Record` appeared bare); restored to the shape resolveModel
  // visibly relies on (string keys -> Provider values).
  providers: Record<string, Provider>;
}

/**
 * Read and parse ~/.pi/agent/models.json.
 * @throws on missing file or malformed JSON (surfaced to the user by main.ts).
 */
export function loadModelsJson(): ModelsJson {
  const p = path.join(os.homedir(), ".pi", "agent", "models.json");
  const raw = fs.readFileSync(p, "utf-8");
  return JSON.parse(raw) as ModelsJson;
}

/** Lowercase and strip every non-alphanumeric character. */
function normalize(s: string): string {
  return s.toLowerCase().replace(/[^a-z0-9]/g, "");
}

function
normalizeWithSubs(s: string): string { + // Common shorthand substitutions + return normalize(s).replace(/p(?=\d)/g, ""); // k2p5 -> k25 +} + +export function resolveModel(alias: string): string { + const data = loadModelsJson(); + + // 1. Exact provider/id path (e.g. "minimax-token-plan/MiniMax-M2.7") + if (alias.includes("/") && !alias.includes(" ")) { + const [providerName, modelId] = alias.split("/"); + const provider = data.providers[providerName]; + if (provider) { + const model = provider.models.find((m) => m.id === modelId); + if (model) return `${providerName}/${modelId}`; + } + } + + // 2. Exact provider match (first model in that provider) + const provider = data.providers[alias]; + if (provider && provider.models.length > 0) { + return `${alias}/${provider.models[0].id}`; + } + + // 3. Fuzzy match across all models (id or name) + const lowerAlias = alias.toLowerCase(); + const normAlias = normalize(alias); + const normSubAlias = normalizeWithSubs(alias); + const candidates: { score: number; fullId: string }[] = []; + + for (const [providerName, p] of Object.entries(data.providers)) { + for (const m of p.models) { + const idLower = m.id.toLowerCase(); + const nameLower = (m.name || "").toLowerCase(); + const normId = normalize(m.id); + const normName = normalize(m.name || ""); + + if (idLower === lowerAlias || nameLower === lowerAlias) { + return `${providerName}/${m.id}`; + } + + let score = 0; + if (idLower.includes(lowerAlias)) score += 10; + if (nameLower.includes(lowerAlias)) score += 8; + if (normId.includes(normAlias)) score += 6; + if (normName.includes(normAlias)) score += 5; + if (normId.includes(normSubAlias)) score += 4; + if (normName.includes(normSubAlias)) score += 3; + if ((m.name || "").toLowerCase().split(/[^a-z0-9]+/).some((w) => w === lowerAlias)) score += 2; + + if (score > 0) { + candidates.push({ score, fullId: `${providerName}/${m.id}` }); + } + } + } + + if (candidates.length === 0) { + throw new Error(`Could not resolve model 
alias "${alias}". Check ~/.pi/agent/models.json`); + } + + candidates.sort((a, b) => b.score - a.score); + return candidates[0].fullId; +} diff --git a/src/utils.ts b/src/utils.ts new file mode 100644 index 0000000..1bb18b7 --- /dev/null +++ b/src/utils.ts @@ -0,0 +1,31 @@ +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +export function ensureDir(p: string): void { + fs.mkdirSync(p, { recursive: true }); +} + +export function getResearchDir(): string { + return path.join(os.homedir(), ".pi", "research"); +} + +export function getConfigPath(): string { + return path.join(getResearchDir(), "config.json"); +} + +export function loadConfig(): Record { + try { + const p = getConfigPath(); + if (fs.existsSync(p)) { + return JSON.parse(fs.readFileSync(p, "utf-8")); + } + } catch {} + return {}; +} + +export function makeTempSession(): string { + const dir = path.join(os.homedir(), ".pi", "research", "sessions"); + ensureDir(dir); + return path.join(dir, `session-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`); +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..f15a70f --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,16 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] +}