commit 047cf4b606ef3c175c4b9919bd5c6e1e930f5b57 Author: Kaloyan Nikolov Date: Fri Apr 17 15:30:19 2026 +0200 Initial commit: research-pi headless research orchestrator diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd6e803 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +node_modules/ +dist/ +*.log +.DS_Store diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..109cf7c --- /dev/null +++ b/PLAN.md @@ -0,0 +1,173 @@ +# POC Plan: Testing Framework Platform + +## Overview + +Based on comprehensive web and academic research on "test" methodologies, this document outlines a Proof of Concept (POC) plan for building a next-generation testing platform that integrates modern testing paradigms, AI-augmented testing, and developer experience improvements. + +## Recommended Stack + +### Core Technology Stack + +| Layer | Technology | Rationale | +|-------|------------|-----------| +| **Runtime** | Node.js 20+ with TypeScript | Dominant ecosystem, excellent testing tool support | +| **Test Runner** | Vitest | Native ESM support, Vite integration, faster than Jest | +| **E2E Testing** | Playwright | Industry leader, cross-browser support, reliable auto-waiting | +| **UI Components** | Storybook + Testing Library | Component isolation, behavior-focused testing | +| **API Testing** | MSW (Mock Service Worker) + Supertest | Realistic mocking, boundary testing | +| **Coverage** | V8/Built-in (Vitest) | Fast, accurate, native integration | +| **CI/CD** | GitHub Actions | Native integration, extensive testing actions | + +### Optional Advanced Components + +| Component | Technology | Use Case | +|-----------|------------|----------| +| **Contract Testing** | Pact | Microservices with API contracts | +| **Load Testing** | k6 | Performance validation | +| **Mutation Testing** | Stryker JS | Test quality assurance | +| **Visual Regression** | Chromatic/Storybook | UI consistency testing | + +## Architecture Overview + +``` 
+┌─────────────────────────────────────────────────────────────────┐ +│ TESTING PLATFORM │ +├─────────────────────────────────────────────────────────────────┤ +│ Test Orchestrator │ +│ ├── Test Discovery & Scheduling │ +│ ├── Parallel Execution Engine │ +│ ├── Result Aggregation & Reporting │ +│ └── CI/CD Integration Layer │ +├─────────────────────────────────────────────────────────────────┤ +│ Test Type Modules │ +│ ├── Unit Tests (Vitest) │ +│ ├── Integration Tests (Supertest/MSW) │ +│ ├── E2E Tests (Playwright) │ +│ ├── Component Tests (Storybook) │ +│ └── Contract Tests (Pact - optional) │ +├─────────────────────────────────────────────────────────────────┤ +│ AI-Augmented Layer (Future) │ +│ ├── Test Generation (LLM-based) │ +│ ├── Test Failure Diagnosis │ +│ └── Coverage Gap Analysis │ +├─────────────────────────────────────────────────────────────────┤ +│ Developer Experience │ +│ ├── Watch Mode with Hot Reload │ +│ ├── Interactive HTML Reports │ +│ ├── VS Code Extension │ +│ └── Slack/Discord Notifications │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## POC Scope & Timeline + +### Week 1: Foundation (Days 1-5) + +**Day 1-2: Project Setup** +- Initialize TypeScript + Vitest project +- Configure linting (ESLint) and formatting (Prettier) +- Set up directory structure +- Basic test runner configuration + +**Day 3-5: Core Testing Infrastructure** +- Implement test discovery mechanism +- Create test execution engine +- Build result formatting/reporting +- Add coverage collection + +**Deliverable:** CLI tool that can discover and run tests with basic reporting + +### Week 2: E2E & Component Testing (Days 6-10) + +**Day 6-7: Playwright Integration** +- Install and configure Playwright +- Create reusable page object patterns +- Implement E2E test scaffolding + +**Day 8-10: Storybook Integration** +- Set up Storybook for UI components +- Configure component testing +- Create example component with tests + +**Deliverable:** Working 
E2E and component test examples + +### Week 3: Developer Experience (Days 11-15) + +**Day 11-12: Watch Mode & IDE Support** +- Implement file watching for test re-execution +- Create VS Code task configurations +- Add debug configurations + +**Day 13-14: Reporting & CI/CD** +- Build HTML test reports +- Create GitHub Actions workflow +- Add coverage badges + +**Day 15: Documentation** +- Write comprehensive README +- Create usage examples +- Document architecture decisions + +**Deliverable:** Production-ready testing platform with documentation + +## Key Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| **Tool Compatibility Issues** | Medium | High | Test integrations early, maintain fallback options | +| **Performance at Scale** | Medium | High | Design for parallelization from start, benchmark regularly | +| **LLM Integration Complexity** | High | Medium | Defer AI features to post-POC phase | +| **Developer Adoption** | Medium | High | Focus on DX, provide migration guides from Jest | +| **CI/CD Integration Complexity** | Low | Medium | Use well-documented GitHub Actions patterns | + +## Success Criteria + +### Technical Metrics +- [ ] Test execution speed >= 2x faster than Jest equivalent +- [ ] 100% coverage reporting accuracy +- [ ] <100ms watch mode feedback loop +- [ ] Successful parallel execution without flakiness + +### Developer Experience Metrics +- [ ] Zero-config startup for common project types +- [ ] Clear, actionable error messages +- [ ] Interactive HTML report with filtering +- [ ] VS Code integration for running/debugging tests + +## Future Enhancements (Post-POC) + +### Phase 2: AI-Augmented Testing +- LLM-based test generation from code analysis +- Automated test failure root cause analysis (inspired by Google's 90% accuracy approach) +- Coverage gap identification with suggested test cases + +### Phase 3: Advanced Testing Modes +- Property-based testing integration (fast-check) +- Mutation 
testing integration (Stryker) +- Chaos engineering hooks + +### Phase 4: Enterprise Features +- Multi-project monorepo support +- Distributed test execution +- Advanced reporting dashboards +- Test flakiness detection and quarantine + +## Research References + +This plan is informed by: +- [Web Research Summary](./research/web-summary.md) - Current tooling landscape and trends +- [Paper Research Summary](./research/paper-summary.md) - Academic research on testing methodologies + +### Key Insights Applied + +1. **Playwright over Cypress** - Research shows Playwright has better cross-browser support and reliability +2. **Vitest for Vite projects** - Emerging as the modern alternative to Jest with native ESM +3. **Testing Library philosophy** - Test behavior, not implementation +4. **LLM-augmented testing** - Research shows significant potential (90%+ accuracy in failure diagnosis) +5. **Mutation testing value** - Studies confirm it improves actual test quality beyond coverage metrics + +--- + +*Document Version: 1.0* +*Created: April 2026* +*Status: Draft for Review* diff --git a/README.md b/README.md new file mode 100644 index 0000000..2a45048 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# research-pi + +Headless research orchestrator for [`pi-coding-agent`](https://github.com/mariozechner/pi-coding-agent). + +Spawns a headless Pi orchestrator that delegates to read-only subagents to: +- Research new topics (`--start_research`) +- Onboard existing codebases (`--onboarding`) +- Plan new features (`--new_feature`) + +## Install + +```bash +curl -fsSL https://raw.githubusercontent.com/YOUR_USERNAME/YOUR_REPO/main/install.sh | bash +``` + +Requires: `node`, `pnpm`, and `pi` (pi-coding-agent) installed. 
+ +## Usage + +```bash +# Research a new topic +research --model kimi-for-coding --start_research \ + --task "native android app using gemma 4 e4b" + +# Onboard an existing project +research --model minimax-token-plan/MiniMax-M2.7 --onboarding + +# Onboard a specific part of a project +research --model k2p5 --onboarding \ + --task "i6_experiments/user/nikolov/experiments/voxpopuli" + +# Plan a new feature +research --model kimi-for-coding --new_feature \ + --task "add a comment section to my react blog" +``` + +## Outputs + +| Mode | Files written | +|------|---------------| +| `--start_research` | `PLAN.md`, `research/web-summary.md`, `research/paper-summary.md` | +| `--onboarding` | `MAP.md`, `ONBOARDING.md` | +| `--new_feature` | `FEATURE.md` | + +## Configuration + +Create `~/.pi/research/config.json`: + +```json +{ + "webSearch": { + "mode": "extension", + "searxngUrl": "http://192.168.178.58:7777", + "mcpUrl": "http://sleepy-think:3001/mcp" + }, + "models": { + "default": "kimi-for-coding", + "web-researcher": "k2p5", + "paper-researcher": "minimax-token-plan/MiniMax-M2.7" + } +} +``` + +- `webSearch.mode`: `extension` (default, ships embedded SearXNG extension), `mcp` (proxy to MCP server), or `skill` (raw curl fallback) +- `models.default`: fallback if no `--model` passed +- `models.`: per-subagent model override + +## Architecture + +- `bin/research` — thin launcher that resolves models, builds the Pi CLI invocation, and streams output +- `extensions/subagent-spawner.ts` — registers `spawn_subagent` tool so the orchestrator can delegate +- `extensions/web-search.ts` — SearXNG-based `web_search` + `web_fetch` tools +- `extensions/mcp-web-search.ts` — MCP-proxy variant +- `agents/orchestrator.md` — system prompt for the orchestrator +- The orchestrator is the **only** agent with `write` access. All subagents are strictly read-only. 
diff --git a/agents/orchestrator.md b/agents/orchestrator.md new file mode 100644 index 0000000..10f9c01 --- /dev/null +++ b/agents/orchestrator.md @@ -0,0 +1,71 @@ +You are a Research Orchestrator. You coordinate headless pi subagents to gather information and write structured markdown deliverables. You are the ONLY agent allowed to write files. + +## Your Tools +- `read` — inspect files in the local project +- `write` — create or overwrite markdown deliverables +- `bash` — run quick commands (e.g. count files, check versions) +- `grep`, `find`, `ls` — inspect the codebase structure +- `spawn_subagent` — delegate research/mapping tasks to specialist subagents + +## Modes +The user invoked you in exactly one of these modes. Your goal is to produce the listed files. + +### start_research +Goal: research a topic from scratch (no codebase, or a fresh project folder). +Outputs to write: +- `PLAN.md`: high-level POC plan, recommended stack, risks, timeline +- `research/web-summary.md`: 1-2 page summary of web findings with links +- `research/paper-summary.md`: 1-2 page summary of papers/reports (if relevant) + +### onboarding +Goal: understand an existing codebase. +Outputs to write: +- `MAP.md`: concise feature-to-location mapping, architecture overview +- `ONBOARDING.md`: project description + per-feature guide (where it lives, how to use it, inputs/outputs) + +### new_feature +Goal: figure out how to implement a specific feature in the existing project. +Outputs to write: +- `FEATURE.md`: findings on how this is done currently (SOTA, libraries, patterns), plus tailored integration advice for this specific codebase + +## How to Work +1. **Assess the situation.** Use `ls`, `find`, `bash` (e.g. `find . -type f | wc -l`, `tokei` if available) to gauge codebase size. +2. **Decide your attack plan.** You do NOT need to ask permission. Spawn subagents as you see fit, in parallel or sequence. +3. 
**Delegate via `spawn_subagent`.** Give each subagent a clear, self-contained task. +4. **Synthesize and write.** Collect outputs, then write the final markdown files yourself. + +## Subagent Conventions +When you spawn a subagent, choose the appropriate toolset. They are read-only unless you explicitly give them write/edit tools (which you should NOT do). + +- **Web/Paper researchers** — `tools: "read,bash"`, load the `web-search` extension (or `mcp-web-search` if the config says MCP). They may use bash only for curl/search/fetch. Forbidden: git modifications, redirects to files, rm, etc. +- **Codebase mappers** — `tools: "read,grep,find,ls"`. No bash, no write, no edit. They crawl the source and return structured findings. +- **Project analyzers** — `tools: "read,grep,find,ls"`. No bash, no write, no edit. They analyze package files and integration points. + +Recommended subagent names and roles: +- `web-researcher`: searches frameworks, docs, blogs, repos, latest implementations +- `paper-researcher`: searches arxiv, technical reports, research implementations +- `codebase-discovery`: top-level scan, lists major modules/features +- `module-mapper`: deep-dive into one directory/module +- `dependency-analyzer`: extracts deps, versions, build configs +- `project-analyzer`: understands current stack and where a new feature fits +- `feature-researcher`: researches best practices for a specific feature + +You may spawn multiple `module-mapper` agents in parallel for large codebases. +You may spawn `paper-researcher` whenever a topic feels scientific, algorithmic, performance-oriented, or ML-adjacent. Use your judgment. + +## Read-Only Enforcement +Include this exact paragraph in every subagent task: +> "You are read-only. You may NOT write files, edit code, run git commands that modify state, or use shell redirects (`>`, `>>`). Return all findings as text in your response." 
+ +## Fast-Forward Hints +- In `onboarding` mode, if `MAP.md` or `ONBOARDING.md` already exist, you may still regenerate them if the user wants a fresh pass, but you can also read them to save time. +- In `new_feature` mode, if `MAP.md` or `ONBOARDING.md` exist, read them first to understand the project before spawning analyzers. + +## Output Quality +- Be concise but complete. +- Include file paths and code references where relevant. +- For links, use markdown `[title](url)` format. +- Use `bash` to `mkdir -p` parent directories before `write` if needed. +- If a subagent times out or fails, note the gap explicitly in your deliverables. + +Now begin. diff --git a/bin/research b/bin/research new file mode 100755 index 0000000..41f0146 --- /dev/null +++ b/bin/research @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Resolve symlinks to find the actual script location +SCRIPT_PATH="${BASH_SOURCE[0]}" +while [ -L "$SCRIPT_PATH" ]; do + SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)" + SCRIPT_PATH="$(readlink "$SCRIPT_PATH")" + case "$SCRIPT_PATH" in + /*) ;; + *) SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_PATH" ;; + esac +done +SCRIPT_DIR="$(cd "$(dirname "$SCRIPT_PATH")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +exec node "$ROOT_DIR/dist/main.js" "$@" diff --git a/config/default.json b/config/default.json new file mode 100644 index 0000000..99abf46 --- /dev/null +++ b/config/default.json @@ -0,0 +1,10 @@ +{ + "webSearch": { + "mode": "extension", + "searxngUrl": "http://192.168.178.58:7777", + "mcpUrl": "http://sleepy-think:3001/mcp" + }, + "models": { + "default": "kimi-for-coding" + } +} diff --git a/extensions/mcp-web-search.ts b/extensions/mcp-web-search.ts new file mode 100644 index 0000000..ada6c32 --- /dev/null +++ b/extensions/mcp-web-search.ts @@ -0,0 +1,102 @@ +/** + * MCP Web Search Proxy + * Proxies web_search / web_fetch to an MCP server endpoint. 
+ */ + +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; + +function getMcpUrl(): string { + return process.env.MCP_URL || "http://sleepy-think:3001/mcp"; +} + +const WEB_TOOLS_SECTION = `\`web_search\` — Web lookup via MCP proxy. Returns up to 20 results per query. Follow-up with web_fetch for content from promising URLs. +\`web_fetch\` — extract page text via MCP proxy. Scale maxLength to content type.`; + +async function mcpCall(toolName: string, args: Record): Promise { + const url = getMcpUrl(); + const body = { + jsonrpc: "2.0", + id: Date.now(), + method: "tools/call", + params: { name: toolName, arguments: args }, + }; + + const res = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + signal: AbortSignal.timeout(20000), + body: JSON.stringify(body), + }); + + if (!res.ok) { + throw new Error(`MCP proxy returned ${res.status} ${res.statusText}`); + } + + const data = (await res.json()) as any; + if (data.error) { + throw new Error(`MCP error: ${data.error.message || JSON.stringify(data.error)}`); + } + + // MCP returns content as an array of { type: "text", text: "..." } + const content = data.result?.content; + if (!content || !Array.isArray(content)) { + throw new Error("Unexpected MCP response format"); + } + + return content; +} + +export default function mcpWebSearchExtension(pi: ExtensionAPI) { + pi.on("before_agent_start", async (event) => { + if (!event.systemPrompt.includes("web_search")) { + return { systemPrompt: event.systemPrompt + "\n" + WEB_TOOLS_SECTION }; + } + return { systemPrompt: event.systemPrompt }; + }); + + pi.registerTool({ + name: "web_search", + label: "Web Search (MCP)", + description: "Search the web via MCP proxy. 
Returns up to 20 results.", + parameters: Type.Object({ + query: Type.String({ description: "The search query to execute (max 2000 characters)" }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { query } = params as { query: string }; + try { + const content = await mcpCall("web_search", { query }); + return { content }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `MCP web_search error: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch (MCP)", + description: "Fetch a URL as text via MCP proxy.", + parameters: Type.Object({ + url: Type.String({ description: "The URL to fetch" }), + maxLength: Type.Number({ description: "Maximum characters", default: 20000 }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { url, maxLength = 20000 } = params as { url: string; maxLength?: number }; + try { + const content = await mcpCall("web_fetch", { url, maxLength }); + return { content }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `MCP web_fetch error: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); +} diff --git a/extensions/subagent-spawner.ts b/extensions/subagent-spawner.ts new file mode 100644 index 0000000..91d0841 --- /dev/null +++ b/extensions/subagent-spawner.ts @@ -0,0 +1,171 @@ +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; +import { spawn } from "child_process"; +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +function getRepoRoot(): string { + if (process.env.RESEARCH_PI_ROOT) { + return process.env.RESEARCH_PI_ROOT; + } + throw new Error("RESEARCH_PI_ROOT environment variable is not set. 
Cannot resolve extension/skill paths."); +} + +function resolveExt(name: string): string { + const root = getRepoRoot(); + if (fs.existsSync(name)) return name; + const candidate = path.join(root, "extensions", `${name}.ts`); + if (fs.existsSync(candidate)) return candidate; + throw new Error(`Extension not found: ${name}`); +} + +function resolveSkill(name: string): string { + const root = getRepoRoot(); + if (fs.existsSync(name)) return name; + const candidate = path.join(root, "skills", name); + if (fs.existsSync(candidate)) return candidate; + throw new Error(`Skill not found: ${name}`); +} + +function makeTempSession(): string { + const dir = path.join(os.homedir(), ".pi", "research", "sessions"); + fs.mkdirSync(dir, { recursive: true }); + return path.join(dir, `subagent-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`); +} + +function statusLog(msg: string): void { + const ts = new Date().toISOString().replace("T", " ").slice(0, 19); + process.stderr.write(`[research-status] ${ts} ${msg}\n`); +} + +export default function (pi: ExtensionAPI) { + pi.registerTool({ + name: "spawn_subagent", + description: + "Spawn a headless pi subagent to perform a task. Waits for it to finish and returns its complete output text. Subagents are read-only (no write/edit tools unless explicitly given).", + parameters: Type.Object({ + name: Type.String({ description: "Name of the subagent for logging" }), + task: Type.String({ description: "Full task prompt for the subagent" }), + tools: Type.String({ description: "Comma-separated tools, e.g. read,bash or read,grep,find,ls" }), + extensions: Type.Optional(Type.Array(Type.String(), { description: "Extension names or paths to load" })), + skills: Type.Optional(Type.Array(Type.String(), { description: "Skill names or paths to load" })), + model: Type.Optional(Type.String({ description: "Model override (provider/id or alias). Defaults to orchestrator model." 
})), + timeoutMinutes: Type.Optional(Type.Number({ default: 15, description: "Timeout in minutes" })), + }), + + async execute(callId, params, signal, onUpdate, ctx) { + const { name, task, tools, extensions = [], skills = [], model, timeoutMinutes = 15 } = params as any; + + const resolvedModel = model || (ctx.model ? `${ctx.model.provider}/${ctx.model.id}` : undefined); + if (!resolvedModel) { + return { + content: [{ type: "text", text: "Error: no model specified and orchestrator model unknown." }], + isError: true, + }; + } + + const piArgs = [ + "--mode", "json", + "--print", + "--no-extensions", + "--no-skills", + "--model", resolvedModel, + "--tools", tools, + "--session", makeTempSession(), + "--thinking", "off", + ]; + + for (const ext of extensions) { + piArgs.push("--extension", resolveExt(ext)); + } + for (const skill of skills) { + piArgs.push("--skill", resolveSkill(skill)); + } + + piArgs.push(task); + + statusLog(`Spawning subagent "${name}" (model: ${resolvedModel}, tools: ${tools})`); + + if (onUpdate) { + onUpdate({ + content: [{ type: "text", text: `Spawning subagent "${name}"...` }], + }); + } + + const startTime = Date.now(); + const timeoutMs = timeoutMinutes * 60 * 1000; + + return new Promise((resolve) => { + const proc = spawn("pi", piArgs, { + stdio: ["ignore", "pipe", "pipe"], + env: { + ...process.env, + RESEARCH_PI_ROOT: getRepoRoot(), + }, + }); + + let killed = false; + const timer = setTimeout(() => { + killed = true; + proc.kill("SIGTERM"); + }, timeoutMs); + + let buffer = ""; + const textChunks: string[] = []; + + proc.stdout!.setEncoding("utf-8"); + proc.stdout!.on("data", (chunk: string) => { + buffer += chunk; + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + for (const line of lines) { + if (!line.trim()) continue; + try { + const event = JSON.parse(line); + if (event.type === "message_update") { + const delta = event.assistantMessageEvent; + if (delta?.type === "text_delta") { + textChunks.push(delta.delta 
|| ""); + } + } + } catch {} + } + }); + + proc.stderr!.setEncoding("utf-8"); + proc.stderr!.on("data", () => {}); + + proc.on("close", (code) => { + clearTimeout(timer); + const elapsed = Math.round((Date.now() - startTime) / 1000); + const output = textChunks.join(""); + const status = code === 0 ? "done" : (killed ? "timed out" : "error"); + statusLog(`Subagent "${name}" finished (${status}) in ${elapsed}s`); + + if (killed) { + resolve({ + content: [{ type: "text", text: `Subagent "${name}" timed out after ${timeoutMinutes}m. Partial output:\n\n${output}` }], + isError: true, + }); + return; + } + + resolve({ + content: [{ type: "text", text: `[${name}] ${status} in ${elapsed}s\n\n${output}` }], + isError: code !== 0, + }); + }); + + proc.on("error", (err) => { + clearTimeout(timer); + statusLog(`Subagent "${name}" failed to spawn: ${err.message}`); + resolve({ + content: [{ type: "text", text: `Error spawning subagent "${name}": ${err.message}` }], + isError: true, + }); + }); + }); + }, + }); +} diff --git a/extensions/web-search.ts b/extensions/web-search.ts new file mode 100644 index 0000000..88b2b7a --- /dev/null +++ b/extensions/web-search.ts @@ -0,0 +1,173 @@ +/** + * Web Search & Fetch Tools for research-pi + * - web_search: Search via local SearXNG + * - web_fetch: Fetch and extract content from a URL + */ + +import { Type } from "@sinclair/typebox"; +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; + +function getSearxngUrl(): string { + return process.env.SEARXNG_URL || "http://192.168.178.58:7777"; +} + +const WEB_TOOLS_SECTION = `\`web_search\` — Web lookup. Returns up to 20 results per query. Follow-up with web_fetch for content from promising URLs. +\`web_fetch\` — extract page text. 
Scale maxLength to content type (5-10k for quick facts, 20-50k docs, 100k+ for source/API refs).`; + +export default function webSearchExtension(pi: ExtensionAPI) { + pi.on("before_agent_start", async (event) => { + if (!event.systemPrompt.includes("web_search")) { + return { systemPrompt: event.systemPrompt + "\n" + WEB_TOOLS_SECTION }; + } + return { systemPrompt: event.systemPrompt }; + }); + + pi.registerTool({ + name: "web_search", + label: "Web Search", + description: "Search the web. Returns up to 20 results. Follow-up with web_fetch for content from promising URLs.", + promptSnippet: "Search the web. Returns up to 20 results. Follow-up with web_fetch for content from promising URLs.", + promptGuidelines: [ + "Once you have a promising result, switch to web_fetch instead of spending more searches.", + "Always web_fetch sites you plan on quoting or using information from.", + ], + parameters: Type.Object({ + query: Type.String({ description: "The search query to execute (max 2000 characters)" }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { query } = params as { query: string }; + try { + const searchUrl = new URL("/search", getSearxngUrl()); + searchUrl.searchParams.append("q", query); + searchUrl.searchParams.append("format", "json"); + + const response = await fetch(searchUrl.toString()); + if (!response.ok) { + return { + content: [{ type: "text" as const, text: `Search request failed: ${response.status} ${response.statusText}` }], + isError: true, + }; + } + + const data = (await response.json()) as { + results?: Array<{ title: string; url: string; content?: string }>; + }; + + if (!data.results || !Array.isArray(data.results)) { + return { + content: [{ type: "text" as const, text: "No results found or invalid response format from search engine." 
}], + }; + } + + const formattedResults = data.results + .map( + (result, index) => + `[${index + 1}] ${result.title}\nURL: ${result.url}\n${result.content || "No description available"}\n` + ) + .join("\n"); + + return { + content: [ + { + type: "text" as const, + text: `Found ${data.results.length} results:\n\n${formattedResults}`, + }, + ], + }; + } catch (error) { + return { + content: [ + { type: "text" as const, text: `Error executing search: ${(error as Error).message}` }, + ], + isError: true, + }; + } + }, + }); + + pi.registerTool({ + name: "web_fetch", + label: "Web Fetch", + description: "Fetch a URL as text. Choose maxLength based on content type.", + promptSnippet: "Fetch a URL as text.", + promptGuidelines: [ + "Set maxLength based on needs (50,000 default). Lower if a quick check, higher if precise details are important (documentation etc.)", + ], + parameters: Type.Object({ + url: Type.String({ description: "The URL to fetch" }), + maxLength: Type.Number({ + description: "Maximum characters of extracted text to return. 
Be context-aware.", + default: 20000, + }), + }), + + async execute(_toolCallId, params, _signal, _onUpdate, _ctx) { + const { url, maxLength = 20000 } = params as { url: string; maxLength?: number }; + try { + const response = await fetch(url, { + headers: { + "User-Agent": "Mozilla/5.0 (compatible; PiCodingAgent/1.0)", + Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.7", + }, + redirect: "follow", + signal: AbortSignal.timeout(15000), + }); + + if (!response.ok) { + return { + content: [{ type: "text" as const, text: `Fetch failed: ${response.status} ${response.statusText}` }], + isError: true, + }; + } + + const contentType = response.headers.get("content-type") || ""; + const raw = await response.text(); + + let text: string; + if (contentType.includes("text/html") || contentType.includes("application/xhtml")) { + text = raw + .replace(//gi, "") + .replace(//gi, "") + .replace(//g, "") + .replace(/<(nav|header|footer)[\s\S]*?<\/\1>/gi, "") + .replace(/<\/(p|div|li|tr|h[1-6]|blockquote|pre|section|article)>/gi, "\n") + .replace(//gi, "\n") + .replace(/<[^>]+>/g, "") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, " ") + .replace(/[ \t]+/g, " ") + .replace(/\n{3,}/g, "\n") + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .join("\n") + .trim(); + } else { + text = raw.trim(); + } + + const truncated = text.length > maxLength; + const output = truncated ? text.slice(0, maxLength) + "\n\n[... truncated]" : text; + + return { + content: [ + { + type: "text" as const, + text: `Fetched ${url} (${text.length} chars${truncated ? 
`, showing first ${maxLength}` : ""}):\n\n${output}`, + }, + ], + }; + } catch (error) { + return { + content: [{ type: "text" as const, text: `Error fetching URL: ${(error as Error).message}` }], + isError: true, + }; + } + }, + }); +} diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..cd2346d --- /dev/null +++ b/install.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +# === CONFIGURE THIS BEFORE HOSTING === +# Replace with your actual GitHub repo URL: +REPO_URL="${RESEARCH_PI_REPO:-https://github.com/YOUR_USERNAME/YOUR_REPO.git}" +# ===================================== + +INSTALL_DIR="${HOME}/.pi/research" +BIN_TARGET="${HOME}/.local/bin/research" + +echo "==> Installing research-pi..." + +# Dependencies +if ! command -v node >/dev/null 2>&1; then + echo "Error: Node.js is required but not installed." + exit 1 +fi + +if ! command -v pi >/dev/null 2>&1; then + echo "Error: pi (pi-coding-agent) is required but not installed." + exit 1 +fi + +if ! command -v pnpm >/dev/null 2>&1; then + echo "Error: pnpm is required but not installed." + exit 1 +fi + +# Clone or update +if [ -d "$INSTALL_DIR/.git" ]; then + echo "==> Updating existing installation..." + git -C "$INSTALL_DIR" pull --ff-only +else + echo "==> Cloning repository..." + mkdir -p "$(dirname "$INSTALL_DIR")" + git clone "$REPO_URL" "$INSTALL_DIR" +fi + +# Build +cd "$INSTALL_DIR" +if [ ! -d "node_modules" ]; then + echo "==> Installing dependencies..." + pnpm install +fi + +echo "==> Building..." +pnpm build + +# Symlink +mkdir -p "$(dirname "$BIN_TARGET")" +if [ -L "$BIN_TARGET" ] || [ -e "$BIN_TARGET" ]; then + rm -f "$BIN_TARGET" +fi +ln -s "$INSTALL_DIR/bin/research" "$BIN_TARGET" + +# Default config +CONFIG_DIR="${HOME}/.pi/research" +CONFIG_FILE="${CONFIG_DIR}/config.json" +if [ ! -f "$CONFIG_FILE" ]; then + echo "==> Creating default config..." + cp "$INSTALL_DIR/config/default.json" "$CONFIG_FILE" +fi + +echo "==> Installation complete!" 
+echo " Binary: $BIN_TARGET" +echo " Source: $INSTALL_DIR" +echo "" +echo "Usage example:" +echo ' research --model k2p5 --start_research --task "native android app using gemma 4 e4b"' diff --git a/package.json b/package.json new file mode 100644 index 0000000..c3582eb --- /dev/null +++ b/package.json @@ -0,0 +1,17 @@ +{ + "name": "research-pi", + "version": "0.1.0", + "description": "Headless research orchestrator for pi-coding-agent", + "bin": { + "research": "./bin/research" + }, + "scripts": { + "build": "tsc", + "dev": "tsc --watch" + }, + "dependencies": {}, + "devDependencies": { + "@types/node": "^20.0.0", + "typescript": "^5.0.0" + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml new file mode 100644 index 0000000..0114a7f --- /dev/null +++ b/pnpm-lock.yaml @@ -0,0 +1,39 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + '@types/node': + specifier: ^20.0.0 + version: 20.19.39 + typescript: + specifier: ^5.0.0 + version: 5.9.3 + +packages: + + '@types/node@20.19.39': + resolution: {integrity: sha512-orrrD74MBUyK8jOAD/r0+lfa1I2MO6I+vAkmAWzMYbCcgrN4lCrmK52gRFQq/JRxfYPfonkr4b0jcY7Olqdqbw==} + + typescript@5.9.3: + resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@6.21.0: + resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + +snapshots: + + '@types/node@20.19.39': + dependencies: + undici-types: 6.21.0 + + typescript@5.9.3: {} + + undici-types@6.21.0: {} diff --git a/research/paper-summary.md b/research/paper-summary.md new file mode 100644 index 0000000..e341af6 --- /dev/null +++ b/research/paper-summary.md @@ -0,0 +1,274 @@ +# Research Paper Summary: Testing Methodologies + +A synthesis of recent academic research, technical reports, and scientific approaches to testing 
across software engineering, statistics, and emerging domains. + +--- + +## 1. Key Research Domains + +### Primary Areas of Academic Focus + +| Domain | Description | Key Venues | +|--------|-------------|------------| +| **Software Testing** | Test generation, prioritization, regression | ICSE, ASE, FSE | +| **Statistical Testing** | Hypothesis testing, e-values, p-values | stat.ME, math.ST | +| **Fuzzing** | Automated vulnerability discovery | ACM CCS, S&P, USENIX | +| **ML/AI Testing** | Deep learning model validation | ML conferences, arXiv | +| **Quantum Testing** | Quantum program verification | QCE, arXiv quant-ph | +| **CPS Testing** | Cyber-physical systems security | Embedded systems venues | + +--- + +## 2. Notable Papers and Findings + +### A. Software Testing & Test Generation + +#### LLM-Augmented Testing + +**"Enhancing Large Language Models with Retrieval Augmented Generation for Software Testing" (2026)** +- **Authors:** Fingleton, Siavash, Moin +- **Key Finding:** RAG pipelines reduce LLM hallucination and improve test generation effectiveness +- **Link:** [arXiv:2604.15270](https://arxiv.org/abs/2604.15270) + +**"E-Test: E'er-Improving Test Suites" (2025)** +- **Authors:** Qiu, Di Grazia, Mariani, Pezzè +- **Key Finding:** LLM-augmented test suites achieve F1-score of 0.55 vs 0.34 for traditional regression testing +- **Method:** Augments tests using production execution scenarios not covered by existing tests +- **Link:** [arXiv:2510.19860](https://arxiv.org/abs/2510.19860) + +**"Inline Tests" (ASE 2022)** +- **Authors:** Liu, Nie, Legunsen, Gligoric +- **Innovation:** I-Test framework for testing individual statements +- **Performance:** Negligible overhead (0.007x–0.014x) +- **Impact:** Found 2 faults in production open-source projects +- **Link:** [arXiv:2209.06315](https://arxiv.org/abs/2209.06315) + +#### Mutation Testing Research + +**"Does mutation testing improve testing practices?" 
(ICSE 2021)** +- **Authors:** Petrović, Ivanković, Fraser, Just +- **Scope:** Analysis of 15 million mutants +- **Key Finding:** Mutants are coupled with real faults; developers write more tests when using mutation testing +- **Significance:** Validates mutation testing as a quality metric beyond coverage +- **Link:** [arXiv:2103.07189](https://arxiv.org/abs/2103.07189) + +### B. Statistical & Hypothesis Testing + +**"Continuous Testing: Unifying Tests and E-values" (2024)** +- **Author:** Nick W. Koning +- **Innovation:** Unifies e-values and classical testing into single continuous framework +- **Key Finding:** E-values provide stronger evidence guarantees than p-values +- **Significance:** Foundation for sequential/adaptive testing methods +- **Link:** [arXiv:2409.05654](https://arxiv.org/abs/2409.05654) + +**"The Test of Tests: A Framework For Differentially Private Hypothesis Testing" (2023)** +- **Authors:** Kazan, Shi, Groce, Bray +- **Innovation:** Black-box framework for differentially private hypothesis tests +- **Performance:** Requires only 5-6x more data than public setting at ε=1 +- **Link:** [arXiv:2302.04260](https://arxiv.org/abs/2302.04260) + +### C. Fuzzing & Security Testing + +**"Prompt Fuzzing for Fuzz Driver Generation" (ACM CCS 2024)** +- **Authors:** Lyu, Xie, Chen, Chen +- **Innovation:** Coverage-guided fuzzing using LLMs for prompt fuzzing +- **Performance:** 1.61-1.63x higher branch coverage than OSS-Fuzz/Hopper +- **Impact:** Found 33 new bugs in real-world software +- **Link:** [arXiv:2312.17677](https://arxiv.org/abs/2312.17677) + +**"Large-Scale Empirical Analysis of Continuous Fuzzing" (2025)** +- **Authors:** Shirai et al. 
+- **Scope:** Analysis of ~1.12 million fuzzing sessions from 878 OSS-Fuzz projects +- **Key Findings:** + - High detection rates in early stages + - Coverage continues increasing over time (not saturating quickly) +- **Link:** [arXiv:2510.16433](https://arxiv.org/abs/2510.16433) + +**"Deep Reinforcement Fuzzing" (2018)** +- **Innovation:** Deep RL applied to fuzzing +- **Impact:** Found 20+ bugs in real-world software +- **Link:** [arXiv:1801.04589](https://arxiv.org/abs/1801.04589) + +### D. Metamorphic Testing + +**"Evaluating Human Trajectory Prediction with Metamorphic Testing" (2024)** +- **Authors:** Spieker, Belmecheri, Gotlieb, Lazaar +- **Innovation:** Wasserstein Violation Criterion for assessing metamorphic relations in stochastic systems +- **Application:** Oracle-less testing for ML predictions +- **Link:** [arXiv:2407.18756](https://arxiv.org/abs/2407.18756) + +**"METAL: Metamorphic Testing Framework for Analyzing Large-Language Model Qualities" (2023)** +- **Authors:** Hyun, Guo, Babar +- **Innovation:** Generates hundreds of metamorphic relations from templates +- **Novel Metric:** Integrates Attack Success Rate (ASR) with semantic quality +- **Link:** [arXiv:2312.06056](https://arxiv.org/abs/2312.06056) + +### E. Machine Learning System Testing + +**"Testing Deep Learning Models: A First Comparative Study" (2022)** +- **Authors:** Ahuja, Gotlieb, Spieker +- **Scope:** Comparative evaluation of differential, metamorphic, mutation, combinatorial, and adversarial testing +- **Target:** Vision-based systems +- **Link:** [arXiv:2202.12139](https://arxiv.org/abs/2202.12139) + +**"DeepMutation: Mutation Testing of Deep Learning Systems" (ISSRE 2018)** +- **Authors:** Ma et al. +- **Innovation:** Source-level and model-level mutation operators for DL systems +- **Purpose:** Evaluating test data quality for neural networks +- **Link:** [arXiv:1805.05206](https://arxiv.org/abs/1805.05206) + +### F. 
Quantum Software Testing + +**"Testing Multi-Subroutine Quantum Programs: From Unit Testing to Integration Testing" (2023)** +- **Authors:** Long, Zhao +- **Significance:** First comprehensive framework for quantum program testing +- **Components:** IO analysis, quantum relation checking, structural/behavior testing +- **Link:** [arXiv:2306.17407](https://arxiv.org/abs/2306.17407) + +### G. CI/CD & Regression Testing + +**"Formalizing Regression Testing for Agile and Continuous Integration Environments" (2025)** +- **Authors:** Das, Gary +- **Innovation:** First formalization using build-tuples and regression test windows +- **Application:** Continuous regression testing in agile environments +- **Link:** [arXiv:2511.02810](https://arxiv.org/abs/2511.02810) + +### H. Industrial Applications + +**"LLM-Based Automated Diagnosis Of Integration Test Failures At Google" (2026)** +- **Authors:** Ziftci, Liu, Greene, Dalloro +- **Tool:** Auto-Diagnose +- **Performance:** 90.14% accuracy in root cause diagnosis +- **Usage:** Deployed Google-wide, only 5.8% "not helpful" ratings +- **Impact:** Significant reduction in debugging time for integration failures +- **Link:** [arXiv:2604.12108](https://arxiv.org/abs/2604.12108) + +**"AnyPoC: Universal Proof-of-Concept Test Generation" (2026)** +- **Authors:** Zhao, Yang, et al. +- **Innovation:** Multi-agent framework for executable PoC generation +- **Performance:** 1.3x more valid PoCs than Claude Code +- **Impact:** Discovered 122 new bugs (105 confirmed, 86 fixed) +- **Link:** [arXiv:2604.11950](https://arxiv.org/abs/2604.11950) + +--- + +## 3. 
Research Trends + +### Current Trends (2024-2025) + +| Trend | Description | Key Papers | +|-------|-------------|------------| +| **LLM-Augmented Testing** | RAG pipelines, automated test generation, failure diagnosis | E-Test, Google Auto-Diagnose | +| **Continuous/Adaptive Testing** | Formal models for agile regression testing | Das & Gary (2025) | +| **Deep Learning Testing** | Mutation testing for neural networks, adversarial testing | DeepMutation, Ahuja et al. | +| **Fuzzing Evolution** | Prompt fuzzing, RL-based fuzzing, directed fuzzing | PromptFuzz, Deep RL Fuzzing | +| **Quantum Software Testing** | First frameworks emerging for quantum programs | Long & Zhao (2023) | +| **Metamorphic Testing** | Expansion to LLM quality testing, stochastic systems | METAL, Spieker et al. | +| **E-values in Testing** | Alternative to p-values with stronger guarantees | Koning (2024) | + +### Emerging Techniques + +| Technique | Description | Source | +|-----------|-------------|--------| +| **Property-Based Mutation Testing** | Combines mutation testing with formal property validation | Recent workshop papers | +| **Inline Testing** | Statement-level testing with negligible overhead | Liu et al. (ASE 2022) | +| **Behavioral Diversity** | Using mutation to measure test suite behavior diversity | Follow-up to Petrović et al. | +| **Test Smells Analysis** | Flaky test prediction using test smells | Follow-up research | + +--- + +## 4. 
Algorithmic and Performance Insights + +### Key Algorithmic Contributions + +| Algorithm/Framework | Contribution | Performance | +|--------------------|--------------|-------------| +| **E-value Framework** | Generalizes tests to continuous domain | Stronger evidence guarantees than p-values | +| **LLVM-based Mutation (Mull)** | Language-independent mutation via IR manipulation | Faster via JIT compilation | +| **Coverage-Guided Prompt Fuzzing** | Iterative LLM-based fuzz driver generation | 1.61-1.63x higher branch coverage | +| **Active Fuzzing** | Online active learning for CPS network attacks | Adaptive test generation | +| **Token-Level Fuzzing** | Mutations at token level | Finds bugs byte/grammar fuzzing miss | + +### Performance Metrics + +| Metric | Value | Context | +|--------|-------|---------| +| E-Test F1-score | 0.55 | vs 0.34 regression, 0.39 vanilla LLM | +| Inline Testing Overhead | 0.007x–0.014x | Negligible runtime impact | +| Auto-Diagnose Accuracy | 90.14% | Google integration test failures | +| PromptFuzz Coverage | 1.61-1.63x | vs OSS-Fuzz and Hopper | +| Mutation vs Coverage | 96.01% vs 55.68% | Defect detection rate | +| Mull Processing | Significant speedup | Via IR-level manipulation | + +### Benchmarks Referenced + +| Benchmark | Description | Papers Using | +|-----------|-------------|--------------| +| **OSS-Fuzz** | Google's continuous fuzzing service (~1.12M sessions, 878 projects) | Shirai et al. (2025) | +| **Defects4J** | Widely-used bug benchmark for Java | Multiple validation studies | +| **CVEFixes** | Vulnerability fixing dataset | Security testing research | +| **SIR-Bench** | Security incident response (794 test cases) | CPS testing | + +--- + +## 5. Key Research Insights + +### Validated Findings + +1. **Mutation Testing Value** - Petrović et al.'s 15-million-mutant study confirms mutants correlate with real faults and drive better testing practices + +2. 
**LLM Effectiveness** - LLM-augmented test generation (E-Test) significantly outperforms vanilla LLM prompting (F1: 0.55 vs 0.39), with RAG pipelines further reducing hallucination (Fingleton et al.)
+
+3. **Industrial Success** - Google's Auto-Diagnose demonstrates 90%+ accuracy for test failure diagnosis at scale
+
+4. **Fuzzing Effectiveness** - Coverage-guided approaches (especially LLM-augmented) consistently outperform random fuzzing
+
+5. **E-value Superiority** - E-values provide stronger statistical guarantees than p-values for sequential testing
+
+### Research Gaps Identified
+
+- Limited work on quantum software testing (emerging field)
+- Property-based mutation testing still underexplored
+- Unified frameworks for multi-paradigm testing lacking
+- Tool integration with modern CI/CD workflows needs improvement
+
+---
+
+## 6. Implications for Practice
+
+### Evidence-Based Recommendations
+
+| Practice | Evidence | Source |
+|----------|----------|--------|
+| Adopt mutation testing | 96% defect detection vs 55% coverage | Petrović et al. |
+| Use LLM-augmented test generation | ~40% F1 improvement over vanilla LLM (0.55 vs 0.39) | Qiu et al. (E-Test) |
+| Implement continuous fuzzing | High early detection, sustained coverage growth | Shirai et al. |
+| Consider e-values for sequential testing | Stronger guarantees than p-values | Koning |
+| Explore metamorphic testing for ML | Effective for oracle-less scenarios | Multiple papers |
+
+### Emerging Practical Tools
+
+| Tool/Approach | Status | Source |
+|---------------|--------|--------|
+| Auto-Diagnose (Google) | Production deployed | Google research |
+| AnyPoC | Research prototype | Zhao et al. |
+| Prompt Fuzzing | Academic prototype | Lyu et al. |
+| Inline Testing (I-Test) | Research prototype | Liu et al. |
+
+---
+
+## 7. Conclusion
+
+The research landscape on testing is experiencing rapid evolution, particularly with the integration of **large language models** into testing workflows and the maturation of **mutation testing** as a quality metric. 
Key developments include: + +- **AI-augmented testing** showing production-ready results (90% accuracy at Google) +- **E-values** emerging as a statistical foundation for continuous testing +- **Quantum testing** representing a new frontier +- **Metamorphic testing** expanding beyond traditional applications + +The overarching trend is toward **intelligent, continuous, and adaptive** testing systems that leverage both rigorous statistical foundations and modern AI capabilities. + +--- + +*Research compiled: April 2026* +*Sources: arXiv, ACM Digital Library, IEEE Xplore, conference proceedings* \ No newline at end of file diff --git a/research/web-summary.md b/research/web-summary.md new file mode 100644 index 0000000..5a032f0 --- /dev/null +++ b/research/web-summary.md @@ -0,0 +1,242 @@ +# Web Research Summary: Testing Methodologies + +A comprehensive summary of the current testing landscape, frameworks, tools, and emerging trends based on web research conducted April 2026. + +--- + +## 1. 
Main Domains Where "Test" is Relevant + +### Software Testing (Primary Focus) +The dominant context for "test" is software quality assurance, encompassing: + +| Category | Description | +|----------|-------------| +| **Unit Testing** | Testing individual components in isolation | +| **Integration Testing** | Testing interactions between components | +| **End-to-End (E2E)** | Full application workflow testing | +| **Performance/Load Testing** | System behavior under load | +| **Contract Testing** | API contract validation between services | +| **Property-Based Testing** | Testing with generated inputs | +| **Mutation Testing** | Evaluating test quality via code mutation | +| **Visual/Regression Testing** | UI appearance validation | +| **Security Testing** | Vulnerability and penetration testing | +| **Chaos Engineering** | System resilience through induced failures | + +### Other Domains +- **Medical Testing** - Diagnostic health tests +- **A/B Testing** - Production experimentation frameworks +- **Infrastructure Testing** - Testing Infrastructure-as-Code + +--- + +## 2. Key Frameworks and Tools + +### A. Unit Testing Frameworks + +| Tool | Stars | Best For | +|------|-------|----------| +| [Jest](https://github.com/jestjs/jest) | 45,337 | JavaScript/TypeScript, snapshot testing | +| [Vitest](https://github.com/vitest-dev/vitest) | 16,375 | Vite projects, native ESM | +| [Mocha](https://github.com/mochajs/mocha) | 22,882 | Flexible Node.js testing | +| [pytest](https://github.com/pytest-dev/pytest) | 13,776 | Python ecosystem | +| **JUnit** | Industry standard | Java applications | + +### B. 
End-to-End (Browser) Testing + +| Tool | Stars | Key Strengths | +|------|-------|---------------| +| [Playwright](https://github.com/microsoft/playwright) | 86,678 | Cross-browser (Chromium, Firefox, WebKit), auto-waiting, trace viewer | +| [Cypress](https://github.com/cypress-io/cypress) | 49,626 | Fast execution, great DX, time-travel debugging | +| [Selenium](https://github.com/SeleniumHQ/selenium) | 34,083 | Mature, multi-language, WebDriver standard | +| [WebdriverIO](https://github.com/webdriverio/webdriverio) | 9,793 | Next-gen browser/mobile automation | +| [Nightwatch.js](https://github.com/nightwatchjs/nightwatch) | 11,942 | W3C WebDriver API compliance | + +**Trend Alert:** Playwright (86K+ stars) is overtaking Cypress (49K+ stars) due to superior cross-browser support and reliability. + +### C. API Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Hoppscotch](https://github.com/hoppscotch/hoppscotch) | 78,953 | Open-source Postman alternative | +| [Bruno](https://github.com/usebruno/bruno) | 43,023 | Git-friendly API testing IDE | +| [Supertest](https://github.com/ladjs/supertest) | 14,346 | HTTP assertions for Node.js | +| **REST Assured** | Popular | Java API testing | + +### D. Load/Performance Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [k6](https://github.com/grafana/k6) | 30,379 | Modern Go-based load testing with JS scripting | +| [Locust](https://github.com/locustio/locust) | 27,720 | Python-based, highly scalable | +| [JMeter](https://github.com/apache/jmeter) | 9,348 | Apache's mature load testing tool | +| [Vegeta](https://github.com/tsenart/vegeta) | 25,004 | HTTP load testing CLI tool | + +### E. 
Component/UI Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Storybook](https://github.com/storybookjs/storybook) | 89,727 | Component development and testing isolation | +| [Testing Library](https://github.com/testing-library/react-testing-library) | 19,572 | Behavior-focused testing utilities | +| **Enzyme** | Legacy | Being replaced by Testing Library | + +### F. Contract Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Pact](https://github.com/pact-foundation/pact-ruby) | 2,193 | Consumer-driven contract testing standard | +| [Pact JS](https://github.com/pact-foundation/pact-js) | 1,757 | JavaScript implementation | +| [Pact JVM](https://github.com/pact-foundation/pact-jvm) | 1,127 | JVM/Kotlin implementation | + +### G. Advanced Testing Techniques + +| Tool | Purpose | +|------|---------| +| [Stryker JS](https://github.com/stryker-mutator/stryker-js) | JavaScript mutation testing | +| [Infection](https://github.com/infection/infection) | PHP mutation testing | +| [Hypothesis](https://github.com/HypothesisWorks/hypothesis) | Python property-based testing | +| [fast-check](https://github.com/dubzzz/fast-check) | JavaScript property-based testing | + +### H. Infrastructure Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [Terratest](https://github.com/gruntwork-io/terratest) | 7,894 | Go library for testing Terraform | +| [terraform-compliance](https://github.com/eerkunt/terraform-compliance) | 1,447 | BDD-style security testing for Terraform | + +### I. Security Testing + +| Tool | Stars | Description | +|------|-------|-------------| +| [MobSF](https://github.com/MobSF/Mobile-Security-Framework-MobSF) | 20,812 | Mobile security testing framework | +| [OWASP MASTG](https://github.com/OWASP/owasp-mastg) | 12,830 | Mobile security testing guide | +| [OWASP WSTG](https://github.com/OWASP/wstg) | 9,091 | Web security testing guide | + +--- + +## 3. 
Recent Trends (2024-2025) + +### Major Shifts + +1. **Playwright Dominance** - Crossing 86K stars, replacing Selenium and challenging Cypress + - Better cross-browser support (WebKit, Firefox, Chromium) + - Built-in trace viewer and code generation + - Superior reliability with auto-waiting + +2. **Vitest Rising** - 16K+ stars, becoming the default for Vite projects + - Native ESM support + - Jest-compatible API + - Significantly faster execution + +3. **AI-Assisted Testing** - Rapid emergence of AI-powered tools + - [Browser-use](https://github.com/browser-use/browser-use) (88K+ stars) - AI browser automation + - Automated test generation and maintenance + +4. **Git-Native API Testing** - Bruno's 43K+ stars signal demand for version-controlled collections + - Alternative to proprietary formats (Postman) + - Better CI/CD integration + +5. **Component Testing Maturity** - Storybook with built-in testing + - Visual regression via Chromatic + - Interaction testing in isolation + +6. **Mutation Testing Adoption** - Stryker and Infection gaining traction + - Focus on test quality, not just coverage + - CI integration for quality gates + +7. **Shift-Left Security** - Earlier integration of security testing + - OWASP tools in CI pipelines + - Security-as-code practices + +--- + +## 4. 
Important Resources + +### Best Practice Guides + +| Resource | Link | Description | +|----------|------|-------------| +| JavaScript Testing Best Practices | [github.com/goldbergyoni/javascript-testing-best-practices](https://github.com/goldbergyoni/javascript-testing-best-practices) | 24,602 ⭐ comprehensive guide | +| Node.js Best Practices | [github.com/goldbergyoni/nodebestpractices](https://github.com/goldbergyoni/nodebestpractices) | 105,208 ⭐ includes testing section | + +### Official Documentation + +| Framework | Documentation | +|-----------|---------------| +| Jest | [jestjs.io](https://jestjs.io) | +| Vitest | [vitest.dev](https://vitest.dev) | +| Playwright | [playwright.dev](https://playwright.dev) | +| Cypress | [docs.cypress.io](https://docs.cypress.io) | +| Storybook | [storybook.js.org](https://storybook.js.org) | +| Pact | [pact.io](https://pact.io) | +| k6 | [k6.io](https://k6.io) | + +--- + +## 5. Emerging Best Practices + +### Testing Strategy (2024-2025) + +The modern testing pyramid has evolved: + +``` + ╱╲ + ╱ ╲ Visual regression tests + ╱────╲ + ╱ ╲ E2E tests (critical paths only) + ╱────────╲ + ╱ ╲ Integration tests +╱────────────╲ + Unit tests + Static analysis +``` + +### Key Principles + +1. **Testing Library Philosophy** + - Test behavior, not implementation + - Query elements as users would (getByRole, getByText) + - Avoid testing component internals + +2. **Mocking Best Practices** + - Prefer MSW (Mock Service Worker) for API mocking + - Limit mocking to boundaries (network, filesystem) + - Use real implementations where possible + +3. **Test Organization** + - Colocate tests with source or use `__tests__` directories + - Descriptive names: "should display user name after login" + - Group related tests with describe blocks + +4. **CI/CD Integration** + - Fast tests (unit) on every commit + - Slower tests (E2E) on PR/push to main + - Parallel execution + - Coverage tracking with regression detection + +5. 
**Modern TypeScript/JavaScript Stack (2025)** + | Purpose | Tool | + |---------|------| + | Framework | Vitest (Vite) or Jest | + | E2E | Playwright | + | Component | Storybook + Testing Library | + | API | Supertest or MSW | + | Mocking | Built-in or MSW | + +--- + +## 6. Key Takeaways + +The testing landscape is rapidly evolving toward: + +- **Faster, more reliable tools** - Playwright over Selenium, Vitest over traditional runners +- **Developer-friendly experiences** - Better DX, watch modes, clearer error messages +- **AI integration** - Emerging tools for test generation and maintenance +- **Quality over coverage** - Mutation testing gaining adoption +- **Shift-left practices** - Earlier testing, security in CI/CD +- **Open-source alternatives** - Bruno/Hoppscotch vs proprietary tools + +The dominant trend is **seamless integration** of testing into modern development workflows with minimal friction and maximum feedback value. + +--- + +*Research conducted: April 2026* +*Sources: Open-source repositories, official documentation, community discussions* \ No newline at end of file diff --git a/skills/web-search-bash/SKILL.md b/skills/web-search-bash/SKILL.md new file mode 100644 index 0000000..94a233d --- /dev/null +++ b/skills/web-search-bash/SKILL.md @@ -0,0 +1,61 @@ +--- +name: web-search-bash +description: Web search and fetch using curl/wget against a local SearXNG server. +--- + +Web search and content fetching via local SearXNG server. + +## Server URL + +The SearXNG instance URL is available in the `SEARXNG_URL` environment variable. If unset, fall back to `http://192.168.178.58:7777`. + +```bash +SEARXNG_URL="${SEARXNG_URL:-http://192.168.178.58:7777}" +``` + +## Endpoints + +### Search +- **URL**: `$SEARXNG_URL/search` +- **Parameters**: `q` (query), `format=json` (structured results) + +### Fetch +- Direct HTTP GET to any URL with custom headers. 
+
+---
+
+## Search Examples
+
+```bash
+# Basic search
+SEARXNG_URL="${SEARXNG_URL:-http://192.168.178.58:7777}"
+curl -s "$SEARXNG_URL/search?q=your+query&format=json"
+
+# Extract titles and URLs
+curl -s "$SEARXNG_URL/search?q=your+query&format=json" | \
+  jq -r '.results[] | "[\(.title)] \(.url)"'
+```
+
+## Fetch Examples
+
+```bash
+# Fetch with timeout and user-agent
+curl -s --max-time 15 \
+  -A "Mozilla/5.0 (compatible; PiCodingAgent/1.0)" \
+  -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.7" \
+  "https://example.com"
+
+# Strip HTML to plain text
+curl -s -A "Mozilla/5.0" "https://example.com" | \
+  sed -e 's/<[^>]*>//g' | \
+  tr -s '[:space:]' ' ' | \
+  sed 's/^ *//;s/ *$//' | \
+  head -c 20000
+```
+
+## Tips
+
+- Search returns up to ~20 results.
+- Use `jq` to parse JSON if available.
+- For large pages, limit output with `head -c 20000`.
+- Always include source URLs in summaries.
diff --git a/src/cli.ts b/src/cli.ts
new file mode 100644
index 0000000..27effe8
--- /dev/null
+++ b/src/cli.ts
@@ -0,0 +1,103 @@
+export interface Args {
+  model?: string;
+  mode?: "start_research" | "onboarding" | "new_feature";
+  task?: string;
+  webSearchMode: "extension" | "mcp" | "skill";
+  mcpUrl?: string;
+  outputDir: string;
+  timeout: number;
+  verbose: boolean;
+  help: boolean;
+}
+
+export function showHelp(): void {
+  console.log(`research - Headless research orchestrator for pi-coding-agent
+
+Usage:
+  research --model --start_research --task ""
+  research --model --onboarding [--task ""]
+  research --model --new_feature --task ""
+
+Options:
+  --model Model alias from ~/.pi/agent/models.json (required)
+  --start_research Research a new topic from scratch
+  --onboarding Map and document an existing codebase
+  --new_feature Plan how to add a feature to the current project
+  --task "" Task description / scope (required for start_research and new_feature)
+  --web-search-mode Web search backend: extension (default), mcp, or skill
+  --mcp-url 
MCP server URL (required when --web-search-mode=mcp) + --output-dir Where to write deliverables (default: cwd) + --timeout Per-agent timeout (default: 15) + --verbose Stream full orchestrator output to stderr + --help Show this help message + +Examples: + research --model k2p5 --start_research --task "native android app using gemma 4 e4b" + research --model kimi-for-coding --onboarding + research --model minimax-token-plan/MiniMax-M2.7 --new_feature --task "add comment section to react blog" +`); +} + +export function parseArgs(argv: string[]): Args { + const args: Args = { + webSearchMode: "extension", + outputDir: process.cwd(), + timeout: 15, + verbose: false, + help: false, + }; + + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + switch (a) { + case "--model": + args.model = argv[++i]; + break; + case "--start_research": + args.mode = "start_research"; + break; + case "--onboarding": + args.mode = "onboarding"; + break; + case "--new_feature": + args.mode = "new_feature"; + break; + case "--task": + args.task = argv[++i]; + break; + case "--web-search-mode": + args.webSearchMode = argv[++i] as any; + break; + case "--mcp-url": + args.mcpUrl = argv[++i]; + break; + case "--output-dir": + args.outputDir = argv[++i]; + break; + case "--timeout": + args.timeout = parseInt(argv[++i], 10); + break; + case "--verbose": + args.verbose = true; + break; + case "--help": + case "-h": + args.help = true; + break; + } + } + + return args; +} + +export function validateArgs(args: Args): void { + if (!args.model) { + throw new Error("--model is required"); + } + if (!args.mode) { + throw new Error("One of --start_research, --onboarding, or --new_feature is required"); + } + if (args.webSearchMode === "mcp" && !args.mcpUrl) { + throw new Error("--mcp-url is required when --web-search-mode=mcp"); + } +} diff --git a/src/launcher.ts b/src/launcher.ts new file mode 100644 index 0000000..4e97bd1 --- /dev/null +++ b/src/launcher.ts @@ -0,0 +1,154 @@ +import { spawn } 
from "child_process";
import * as fs from "fs";
import * as path from "path";
import { Args } from "./cli.js";
import { resolveModel } from "./models.js";
import { ensureDir, loadConfig, makeTempSession } from "./utils.js";

/** Write a timestamped status line to stderr (always shown, even non-verbose). */
function statusLog(msg: string): void {
  const ts = new Date().toISOString().replace("T", " ").slice(0, 19);
  process.stderr.write(`[research-status] ${ts} ${msg}\n`);
}

/**
 * Spawn the `pi` CLI as a headless research orchestrator.
 *
 * Builds the extension/skill set from args.webSearchMode, assembles the
 * orchestrator prompt, streams pi's JSON event output (filtered to stderr),
 * and exits this process with pi's exit code when it finishes.
 *
 * Side effects: creates the output directory, spawns a child process,
 * terminates the current process via process.exit.
 */
export function launch(args: Args): void {
  const config = loadConfig();
  const resolvedModel = resolveModel(args.model!);

  // Repo root is where this script is installed (e.g. ~/.pi/research).
  // Launcher runs from dist/src/launcher.js, so go up two levels.
  const repoRoot = path.resolve(path.dirname(process.argv[1]), "..");

  const subagentSpawnerExt = path.join(repoRoot, "extensions", "subagent-spawner.ts");
  const webSearchExt = path.join(repoRoot, "extensions", "web-search.ts");
  const mcpWebSearchExt = path.join(repoRoot, "extensions", "mcp-web-search.ts");
  const webSearchSkill = path.join(repoRoot, "skills", "web-search-bash");
  const orchestratorPrompt = path.join(repoRoot, "agents", "orchestrator.md");

  const extensions: string[] = [subagentSpawnerExt];
  const skills: string[] = [];

  if (args.webSearchMode === "extension") {
    extensions.push(webSearchExt);
  } else if (args.webSearchMode === "mcp") {
    extensions.push(mcpWebSearchExt);
  } else {
    // Any other mode falls back to the bash-based web-search skill.
    skills.push(webSearchSkill);
  }

  // Fix: create the deliverables directory up front. ensureDir was imported
  // but never used, and the orchestrator writes into this directory.
  const outputDir = path.resolve(args.outputDir);
  ensureDir(outputDir);

  const prompt = [
    `MODE: ${args.mode}`,
    args.task ? `TASK: ${args.task}` : "",
    `OUTPUT_DIR: ${outputDir}`,
    `TIMEOUT_MINUTES: ${args.timeout}`,
    `WEB_SEARCH_MODE: ${args.webSearchMode}`,
    args.mcpUrl ? `MCP_URL: ${args.mcpUrl}` : "",
    `CONFIG: ${JSON.stringify(config)}`,
    "Begin.",
  ]
    .filter(Boolean)
    .join("\n");

  const piArgs = [
    "--mode", "json",
    "--print",
    "--no-extensions",
    "--no-skills",
    "--model", resolvedModel,
    "--tools", "read,write,bash,grep,find,ls",
    "--session", makeTempSession(),
    "--thinking", "off",
  ];

  for (const ext of extensions) {
    piArgs.push("--extension", ext);
  }
  for (const skill of skills) {
    piArgs.push("--skill", skill);
  }

  piArgs.push("--append-system-prompt", orchestratorPrompt);
  piArgs.push(prompt);

  statusLog("Launching research orchestrator...");
  if (args.verbose) {
    console.error("[research] Spawning pi with args:", piArgs.join(" "));
  }

  const proc = spawn("pi", piArgs, {
    stdio: ["ignore", "pipe", "pipe"],
    env: {
      ...process.env,
      RESEARCH_PI_ROOT: repoRoot,
      MCP_URL: args.mcpUrl || "",
    },
  });

  // Handle one newline-delimited JSON event from pi's stdout.
  const handleEvent = (line: string): void => {
    if (!line.trim()) return;
    try {
      const event = JSON.parse(line);
      if (event.type === "message_update") {
        const delta = event.assistantMessageEvent;
        if (delta?.type === "text_delta" && args.verbose) {
          process.stderr.write(delta.delta);
        }
      } else if (event.type === "tool_execution_start" && args.verbose) {
        const name = event.toolCall?.name || "tool";
        process.stderr.write(`\n[tool:${name}]\n`);
      }
    } catch {
      // Not JSON — pass the raw line through only in verbose mode.
      if (args.verbose) {
        process.stderr.write(line + "\n");
      }
    }
  };

  // Status lines are always forwarded; everything else only when verbose.
  // (Extracted helper: this logic was triplicated in the original.)
  const emitStderrLine = (line: string): void => {
    if (line.startsWith("[research-status]") || args.verbose) {
      process.stderr.write(line + "\n");
    }
  };

  let stdoutBuffer = "";
  proc.stdout!.setEncoding("utf-8");
  proc.stdout!.on("data", (chunk: string) => {
    stdoutBuffer += chunk;
    const lines = stdoutBuffer.split("\n");
    stdoutBuffer = lines.pop() || "";
    for (const line of lines) {
      handleEvent(line);
    }
  });

  let stderrBuffer = "";
  proc.stderr!.setEncoding("utf-8");
  proc.stderr!.on("data", (chunk: string) => {
    stderrBuffer += chunk;
    const lines = stderrBuffer.split("\n");
    stderrBuffer = lines.pop() || "";
    for (const line of lines) {
      emitStderrLine(line);
    }
  });

  proc.on("close", (code) => {
    // Flush trailing partial lines. Fix: the original flushed only the
    // stderr buffer, silently dropping a final stdout event that arrived
    // without a terminating newline.
    if (stdoutBuffer.trim()) {
      handleEvent(stdoutBuffer);
    }
    if (stderrBuffer.trim()) {
      emitStderrLine(stderrBuffer.trim());
    }
    statusLog(`Orchestrator finished (exit code ${code ?? 0})`);
    process.exit(code ?? 0);
  });

  proc.on("error", (err) => {
    console.error("[research] Failed to spawn pi:", err.message);
    process.exit(1);
  });
}
diff --git a/src/main.ts b/src/main.ts new file mode 100644 index 0000000..7cc52b2 --- /dev/null +++ b/src/main.ts @@ -0,0 +1,18 @@
#!/usr/bin/env node
import { parseArgs, validateArgs, showHelp } from "./cli.js";
import { launch } from "./launcher.js";

const args = parseArgs(process.argv.slice(2));

if (args.help) {
  showHelp();
  process.exit(0);
}

try {
  validateArgs(args);
  launch(args);
} catch (err) {
  // Fix: narrow `unknown` instead of `catch (err: any)`.
  const msg = err instanceof Error ? err.message : String(err);
  console.error("Error:", msg);
  process.exit(1);
}
diff --git a/src/models.ts b/src/models.ts new file mode 100644 index 0000000..fb6002c --- /dev/null +++ b/src/models.ts @@ -0,0 +1,95 @@
import * as fs from "fs";
import * as path from "path";
import * as os from "os";

/** One model entry inside a provider block of ~/.pi/agent/models.json. */
export interface ModelEntry {
  id: string;
  name?: string;
  contextWindow?: number;
  reasoning?: boolean;
}

/** One provider block of ~/.pi/agent/models.json. */
export interface Provider {
  baseUrl: string;
  api: string;
  apiKey: string;
  models: ModelEntry[];
}

export interface ModelsJson {
  // Keyed by provider name. NOTE(review): the type arguments were lost in
  // extraction (`Record` appeared bare); restored to the shape resolveModel
  // visibly relies on (string keys -> Provider values).
  providers: Record<string, Provider>;
}

/**
 * Read and parse ~/.pi/agent/models.json.
 * @throws on missing file or malformed JSON (surfaced to the user by main.ts).
 */
export function loadModelsJson(): ModelsJson {
  const p = path.join(os.homedir(), ".pi", "agent", "models.json");
  const raw = fs.readFileSync(p, "utf-8");
  return JSON.parse(raw) as ModelsJson;
}

/** Lowercase and strip every non-alphanumeric character. */
function normalize(s: string): string {
  return s.toLowerCase().replace(/[^a-z0-9]/g, "");
}

function
normalizeWithSubs(s: string): string { + // Common shorthand substitutions + return normalize(s).replace(/p(?=\d)/g, ""); // k2p5 -> k25 +} + +export function resolveModel(alias: string): string { + const data = loadModelsJson(); + + // 1. Exact provider/id path (e.g. "minimax-token-plan/MiniMax-M2.7") + if (alias.includes("/") && !alias.includes(" ")) { + const [providerName, modelId] = alias.split("/"); + const provider = data.providers[providerName]; + if (provider) { + const model = provider.models.find((m) => m.id === modelId); + if (model) return `${providerName}/${modelId}`; + } + } + + // 2. Exact provider match (first model in that provider) + const provider = data.providers[alias]; + if (provider && provider.models.length > 0) { + return `${alias}/${provider.models[0].id}`; + } + + // 3. Fuzzy match across all models (id or name) + const lowerAlias = alias.toLowerCase(); + const normAlias = normalize(alias); + const normSubAlias = normalizeWithSubs(alias); + const candidates: { score: number; fullId: string }[] = []; + + for (const [providerName, p] of Object.entries(data.providers)) { + for (const m of p.models) { + const idLower = m.id.toLowerCase(); + const nameLower = (m.name || "").toLowerCase(); + const normId = normalize(m.id); + const normName = normalize(m.name || ""); + + if (idLower === lowerAlias || nameLower === lowerAlias) { + return `${providerName}/${m.id}`; + } + + let score = 0; + if (idLower.includes(lowerAlias)) score += 10; + if (nameLower.includes(lowerAlias)) score += 8; + if (normId.includes(normAlias)) score += 6; + if (normName.includes(normAlias)) score += 5; + if (normId.includes(normSubAlias)) score += 4; + if (normName.includes(normSubAlias)) score += 3; + if ((m.name || "").toLowerCase().split(/[^a-z0-9]+/).some((w) => w === lowerAlias)) score += 2; + + if (score > 0) { + candidates.push({ score, fullId: `${providerName}/${m.id}` }); + } + } + } + + if (candidates.length === 0) { + throw new Error(`Could not resolve model 
alias "${alias}". Check ~/.pi/agent/models.json`); + } + + candidates.sort((a, b) => b.score - a.score); + return candidates[0].fullId; +} diff --git a/src/utils.ts b/src/utils.ts new file mode 100644 index 0000000..1bb18b7 --- /dev/null +++ b/src/utils.ts @@ -0,0 +1,31 @@ +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; + +export function ensureDir(p: string): void { + fs.mkdirSync(p, { recursive: true }); +} + +export function getResearchDir(): string { + return path.join(os.homedir(), ".pi", "research"); +} + +export function getConfigPath(): string { + return path.join(getResearchDir(), "config.json"); +} + +export function loadConfig(): Record { + try { + const p = getConfigPath(); + if (fs.existsSync(p)) { + return JSON.parse(fs.readFileSync(p, "utf-8")); + } + } catch {} + return {}; +} + +export function makeTempSession(): string { + const dir = path.join(os.homedir(), ".pi", "research", "sessions"); + ensureDir(dir); + return path.join(dir, `session-${Date.now()}-${Math.random().toString(36).slice(2)}.jsonl`); +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..f15a70f --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,16 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] +}