blitz-swarm/agents.py at main · Joona-t/blitz-swarm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
"""Agent definitions, role prompts, and swarm planning for Blitz-Swarm."""

import json
from dataclasses import dataclass, field

# ---------------------------------------------------------------------------
# Agent output schema — passed to claude --json-schema for structured output
# ---------------------------------------------------------------------------

AGENT_OUTPUT_SCHEMA = {
    "type": "object",
    "properties": {
        "findings": {
            "type": "string",
            "description": "Your detailed findings, analysis, or synthesis in markdown.",
        },
        "key_points": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Bullet-point list of the most important takeaways.",
        },
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Your confidence in the accuracy of your output (0.0–1.0).",
        },
        "gaps_identified": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Areas that need more research or have insufficient coverage.",
        },
        "quality_vote": {
            "type": "string",
            "enum": ["ready", "needs_work"],
            "description": "Vote on whether the collective output is ready for finalization.",
        },
        "quality_notes": {
            "type": "string",
            "description": "Explanation for your quality vote.",
        },
        "dissent": {
            "type": "string",
            "description": "Any disagreements with other agents' findings or the emerging consensus.",
        },
    },
    "required": ["findings", "key_points", "confidence", "quality_vote"],
}

AGENT_OUTPUT_SCHEMA_JSON = json.dumps(AGENT_OUTPUT_SCHEMA)

# ---------------------------------------------------------------------------
# Agent dataclass
# ---------------------------------------------------------------------------


@dataclass
class BlitzAgent:
    id: str
    role: str
    subtopic: str
    system_prompt: str
    model: str = "sonnet"
    max_iterations: int = 3


# ---------------------------------------------------------------------------
# Role prompt templates
# ---------------------------------------------------------------------------

ROLE_PROMPTS = {
    "researcher": """You are a Researcher agent in a parallel multi-agent swarm.

Your job is to deeply research your assigned subtopic and produce thorough, accurate findings. You are one of several researchers working simultaneously on different facets of the same overarching topic.

Guidelines:
- Go deep, not broad. Cover your assigned subtopic exhaustively.
- Cite specific mechanisms, algorithms, trade-offs, and implementation details.
- Note your confidence level honestly — flag areas where you're uncertain.
- Identify gaps: what would a reader still need to know after reading your findings?
- Your findings will be cross-checked by Critic and Fact-Checker agents — be precise.""",

    "critic": """You are a Critic agent in a parallel multi-agent swarm.

Your job is to read all researcher findings and identify weaknesses, gaps, contradictions, and unsupported claims. You are the quality gate — nothing ships without your scrutiny.

Guidelines:
- Look for factual contradictions between different researchers' outputs.
- Flag claims that lack evidence or have low confidence.
- Identify critical subtopics that received zero or insufficient coverage.
- Check logical consistency — do the findings tell a coherent story?
- Be specific about what's wrong and what would fix it.
- Vote "needs_work" if there are unresolved issues. Vote "ready" only when you're genuinely satisfied.""",

    "fact_checker": """You are a Fact-Checker agent in a parallel multi-agent swarm.

Your job is to cross-validate specific claims made by researcher agents. You verify accuracy by checking claims against your knowledge.

Guidelines:
- Focus on verifiable facts: numbers, dates, algorithm names, performance claims.
- Flag any claim that appears incorrect or misleading.
- Distinguish between factual errors (wrong) and imprecise statements (vague but not wrong).
- If a claim is correct but lacks nuance, note the missing context.
- Vote "needs_work" if you find factual errors. Vote "ready" if claims check out.""",

    "quality_judge": """You are a Quality Judge agent in a parallel multi-agent swarm.

Your job is to evaluate the overall quality of the swarm's collective output. You score on four dimensions: coverage, accuracy, clarity, and depth.

Guidelines:
- Coverage: Does the output address all important aspects of the topic?
- Accuracy: Are the claims well-supported and factually correct?
- Clarity: Is the output well-organized and easy to follow?
- Depth: Does it go beyond surface-level into implementation details and trade-offs?
- Your quality_notes should explain your scores on each dimension.
- Vote "ready" only when all four dimensions meet a high bar.
- Your vote carries significant weight in the consensus decision.""",

    "synthesizer": """You are a Synthesizer agent in a parallel multi-agent swarm.

Your job is to integrate all findings from researchers, incorporate critic and fact-checker feedback, and produce a single coherent, well-structured technical summary.

Guidelines:
- Organize findings into a logical structure with clear sections.
- Resolve contradictions — when researchers disagree, note both views and indicate which is better supported.
- Incorporate critic feedback — if a gap was flagged, acknowledge it.
- Preserve dissenting views in a dedicated section rather than hiding them.
- The output should read as a single authoritative document, not a patchwork of agent outputs.
- Aim for depth and precision over length. Every sentence should earn its place.
- Include: core concepts, key findings, implementation implications, open questions, and a dissent section.""",
}

# ---------------------------------------------------------------------------
# Subtopic splitting
# ---------------------------------------------------------------------------


def _split_subtopics_heuristic(topic: str, count: int) -> list[str]:
    """Split a topic into subtopics using static research angles.

    Fallback for when LLM planning is unavailable or disabled.
    """
    angles = [
        "core concepts, definitions, and foundational principles",
        "implementation details, algorithms, and technical architecture",
        "trade-offs, limitations, failure modes, and alternatives",
        "real-world applications, case studies, and current state of the art",
    ]
    subtopics = []
    for i in range(count):
        angle = angles[i % len(angles)]
        subtopics.append(f"{topic} — focusing on {angle}")
    return subtopics


def _split_subtopics_llm(topic: str, count: int) -> list[str]:
    """Split a topic into subtopics using an LLM call.

    Invokes claude -p to analyze the topic and generate targeted subtopics.
    Falls back to heuristic if the LLM call fails.
    """
    import subprocess

    schema = json.dumps({
        "type": "object",
        "properties": {
            "subtopics": {
                "type": "array",
                "items": {"type": "string"},
                "description": f"Exactly {count} specific, non-overlapping subtopics.",
            },
        },
        "required": ["subtopics"],
    })

    prompt = (
        f"Analyze this research topic and split it into exactly {count} specific, "
        f"non-overlapping subtopics that together provide comprehensive coverage.\n\n"
        f"Topic: {topic}\n\n"
        f"Each subtopic should be a focused research angle that a single researcher "
        f"can deeply investigate. Make them specific to this topic, not generic."
    )

    try:
        result = subprocess.run(
            [
                "claude", "-p", prompt,
                "--system-prompt", "You are a research planning assistant. Return JSON only.",
                "--output-format", "json",
                "--model", "haiku",
                "--dangerously-skip-permissions",
            ],
            capture_output=True, text=True, timeout=30,
        )

        if result.returncode == 0:
            data = json.loads(result.stdout.strip())
            subtopics = data.get("subtopics", [])
            if len(subtopics) >= count:
                return [f"{topic} — focusing on {st}" for st in subtopics[:count]]
    except Exception:
        pass

    return _split_subtopics_heuristic(topic, count)


# ---------------------------------------------------------------------------
# Agent planning
# ---------------------------------------------------------------------------

# Model overrides for specific roles (others use default "sonnet")
ROLE_MODEL_OVERRIDES = {
    "quality_judge": "sonnet",
    "synthesizer": "sonnet",
}

PLANNING_SCHEMA = json.dumps({
    "type": "object",
    "properties": {
        "researcher_count": {
            "type": "integer", "minimum": 2, "maximum": 6,
            "description": "Number of researchers to spawn.",
        },
        "critic_count": {
            "type": "integer", "minimum": 1, "maximum": 3,
            "description": "Number of critics to spawn.",
        },
        "needs_fact_checker": {
            "type": "boolean",
            "description": "Whether a dedicated fact-checker is needed.",
        },
        "subtopics": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Specific subtopics for each researcher.",
        },
    },
    "required": ["researcher_count", "critic_count", "needs_fact_checker", "subtopics"],
})


def plan_agents(topic: str, use_llm: bool = True) -> list[BlitzAgent]:
    """Plan which agents to spawn for a given topic.

    When use_llm=True, invokes an LLM to analyze the topic and determine
    optimal agent count and subtopic assignments. Falls back to heuristic.
    """
    plan = None

    if use_llm:
        plan = _llm_plan(topic)

    if plan is None:
        plan = {
            "researcher_count": 2,
            "critic_count": 1,
            "needs_fact_checker": True,
            "subtopics": None,
        }

    researcher_count = plan["researcher_count"]
    critic_count = plan["critic_count"]
    needs_fc = plan["needs_fact_checker"]

    # Get subtopics
    subtopics = plan.get("subtopics")
    if subtopics and len(subtopics) >= researcher_count:
        subtopics = [f"{topic} — focusing on {st}" for st in subtopics[:researcher_count]]
    else:
        subtopics = _split_subtopics_heuristic(topic, researcher_count)

    agents = []

    # Spawn researchers
    for i, subtopic in enumerate(subtopics):
        agents.append(BlitzAgent(
            id=f"researcher_{i:02d}",
            role="researcher",
            subtopic=subtopic,
            system_prompt=ROLE_PROMPTS["researcher"],
            model=ROLE_MODEL_OVERRIDES.get("researcher", "sonnet"),
        ))

    # Spawn critics
    for i in range(critic_count):
        suffix = f"_{i:02d}" if critic_count > 1 else ""
        agents.append(BlitzAgent(
            id=f"critic{suffix}",
            role="critic",
            subtopic=topic,
            system_prompt=ROLE_PROMPTS["critic"],
            model="sonnet",
        ))

    # Spawn fact-checker
    if needs_fc:
        agents.append(BlitzAgent(
            id="fact_checker",
            role="fact_checker",
            subtopic=topic,
            system_prompt=ROLE_PROMPTS["fact_checker"],
            model="sonnet",
        ))

    # Always: 1 quality judge
    agents.append(BlitzAgent(
        id="quality_judge",
        role="quality_judge",
        subtopic=topic,
        system_prompt=ROLE_PROMPTS["quality_judge"],
        model=ROLE_MODEL_OVERRIDES.get("quality_judge", "sonnet"),
    ))

    # Always: 1 synthesizer
    agents.append(BlitzAgent(
        id="synthesizer",
        role="synthesizer",
        subtopic=topic,
        system_prompt=ROLE_PROMPTS["synthesizer"],
        model=ROLE_MODEL_OVERRIDES.get("synthesizer", "sonnet"),
    ))

    return agents


def _llm_plan(topic: str) -> dict | None:
    """Use an LLM to determine optimal swarm composition for a topic."""
    import subprocess

    prompt = (
        f"Analyze this research topic and determine the optimal agent swarm composition.\n\n"
        f"Topic: {topic}\n\n"
        f"Consider:\n"
        f"- How broad is this topic? (narrow=2 researchers, broad=4-6)\n"
        f"- Does it involve claims that need fact-checking? (empirical/technical=yes)\n"
        f"- How many critics are needed? (controversial=2, straightforward=1)\n"
        f"- What specific subtopics should each researcher focus on?"
    )

    try:
        result = subprocess.run(
            [
                "claude", "-p", prompt,
                "--system-prompt", "You are a research planning assistant. Return JSON only.",
                "--output-format", "json",
                "--model", "haiku",
                "--dangerously-skip-permissions",
            ],
            capture_output=True, text=True, timeout=30,
        )

        if result.returncode == 0:
            data = json.loads(result.stdout.strip())
            # Validate required fields
            if all(k in data for k in ("researcher_count", "critic_count", "needs_fact_checker")):
                data["researcher_count"] = max(2, min(6, int(data["researcher_count"])))
                data["critic_count"] = max(1, min(3, int(data["critic_count"])))
                return data

    except Exception:
        pass

    return None