From 22185051a9758f4da633a2fe608b92d9329df676 Mon Sep 17 00:00:00 2001 From: Harsha Vardhan Date: Wed, 11 Feb 2026 18:19:09 +0530 Subject: [PATCH] Format Python sources with Black --- ariv/cli/arivctl.py | 4 +- ariv/orchestrator/__init__.py | 7 +- ariv/orchestrator/router.py | 2 +- ariv/runner/llama_cli.py | 8 +- ariv/scripts/convert_and_quantize.py | 4 +- benchmarks/arc_benchmark.py | 175 +++++++++++++++------------ benchmarks/arc_hinglish.py | 29 +++-- benchmarks/run_bench.py | 22 +++- benchmarks/sanskriti_eval.py | 65 +++++----- 9 files changed, 181 insertions(+), 135 deletions(-) diff --git a/ariv/cli/arivctl.py b/ariv/cli/arivctl.py index ab602aa..15ceaa1 100644 --- a/ariv/cli/arivctl.py +++ b/ariv/cli/arivctl.py @@ -43,7 +43,9 @@ def cmd_start(host: str, port: int) -> None: def cmd_bench(models: List[str], lang: str, subset: str) -> None: from benchmarks.run_bench import run_benchmark - run_benchmark(models=models, lang=lang, subset=subset, output_dir=Path("benchmarks/results")) + run_benchmark( + models=models, lang=lang, subset=subset, output_dir=Path("benchmarks/results") + ) def cmd_download(dry_run: bool) -> None: diff --git a/ariv/orchestrator/__init__.py b/ariv/orchestrator/__init__.py index e398159..bd22a17 100644 --- a/ariv/orchestrator/__init__.py +++ b/ariv/orchestrator/__init__.py @@ -1,5 +1,10 @@ """Orchestration components.""" -from ariv.orchestrator.router import HardwareProfile, ModelManager, RouteDecision, Router +from ariv.orchestrator.router import ( + HardwareProfile, + ModelManager, + RouteDecision, + Router, +) __all__ = ["HardwareProfile", "ModelManager", "RouteDecision", "Router"] diff --git a/ariv/orchestrator/router.py b/ariv/orchestrator/router.py index dd55c09..4b71a84 100644 --- a/ariv/orchestrator/router.py +++ b/ariv/orchestrator/router.py @@ -53,7 +53,7 @@ def _detect_indic(preferred_lang: Optional[str], text: str) -> bool: if preferred_lang and preferred_lang.lower() in INDIC_LANGS: return True for char in text: - if "\u0900" <= char <= "\u0DFF": + if "\u0900" <= char <= "\u0dff": return True return False diff --git a/ariv/runner/llama_cli.py b/ariv/runner/llama_cli.py index b134036..4c751fa 100644 --- a/ariv/runner/llama_cli.py +++ b/ariv/runner/llama_cli.py @@ -81,7 +81,9 @@ async def _collect_stderr() -> None: except json.JSONDecodeError: yield line continue - token = payload.get("token") or payload.get("content") or payload.get("text") + token = ( + payload.get("token") or payload.get("content") or payload.get("text") + ) if token: yield str(token) @@ -89,7 +91,9 @@ async def _collect_stderr() -> None: await stderr_task if process.returncode != 0: - stderr_tail = b"".join(stderr_chunks).decode("utf-8", errors="replace").strip() + stderr_tail = ( + b"".join(stderr_chunks).decode("utf-8", errors="replace").strip() + ) if len(stderr_tail) > 1200: stderr_tail = stderr_tail[-1200:] raise RuntimeError( diff --git a/ariv/scripts/convert_and_quantize.py b/ariv/scripts/convert_and_quantize.py index 03646ea..dd3a818 100644 --- a/ariv/scripts/convert_and_quantize.py +++ b/ariv/scripts/convert_and_quantize.py @@ -17,7 +17,9 @@ def main() -> None: parser.add_argument("--hf-repo", required=True, help="Hugging Face repo path") parser.add_argument("--output", required=True, help="Output GGUF path") parser.add_argument("--quant", default="Q4_K_M", choices=["Q4_K_M", "Q5_0", "Q4_0"]) - parser.add_argument("--llama-cpp", default="llama.cpp", help="Path to llama.cpp repo") + parser.add_argument( + "--llama-cpp", default="llama.cpp", help="Path to llama.cpp repo" + ) args = parser.parse_args() llama_dir = Path(args.llama_cpp) diff --git a/benchmarks/arc_benchmark.py b/benchmarks/arc_benchmark.py index e4629ca..81bccdf 100644 --- a/benchmarks/arc_benchmark.py +++ b/benchmarks/arc_benchmark.py @@ -13,6 +13,7 @@ # Add parent directory to path for imports import sys + sys.path.append(str(Path(__file__).parent.parent)) from core.orchestrator import JugaadOrchestrator @@ -22,9 +23,10 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("ARC-Benchmark") + class ARCBenchmark: """ARC-AGI 2 Benchmark Runner""" - + def __init__(self, pipeline: TRVPipeline): self.pipeline = pipeline self.results = { @@ -36,31 +38,33 @@ def __init__(self, pipeline: TRVPipeline): "failed": 0, "timeout": 0, "total_time": 0.0, - "problems": [] + "problems": [], } - + def load_problems(self, problem_file: str) -> List[Dict[str, Any]]: """Load ARC-AGI problems from file""" try: - with open(problem_file, 'r', encoding='utf-8') as f: + with open(problem_file, "r", encoding="utf-8") as f: data = json.load(f) return data.get("problems", []) except Exception as e: logger.error(f"Failed to load problems: {e}") return [] - - def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + + def evaluate_problem( + self, problem: Dict[str, Any], timeout: int = 300 + ) -> Dict[str, Any]: """Evaluate a single problem""" problem_id = problem.get("id", "unknown") query = problem.get("query", "") expected = problem.get("expected", "") language = problem.get("language", "english") problem_type = problem.get("type", "abstract") - + logger.info(f"🧪 Evaluating {problem_id}: {query[:50]}...") - + start_time = time.time() - + try: # Execute pipeline with ARC-AGI optimized settings result = self.pipeline.execute( @@ -69,18 +73,16 @@ def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[ enable_critic=True, enable_deep_cot=True, enable_self_consistency=True, - reasoning_model="reasoner" + reasoning_model="reasoner", ) - + elapsed = time.time() - start_time - + # Check if solution is correct is_correct = self._check_solution( - result["final_answer"], - expected, - problem_type + result["final_answer"], expected, problem_type ) - + problem_result = { "id": problem_id, "query": query, @@ -91,121 +93,129 @@ def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[ "solved": is_correct, "time": elapsed, "critic_iterations": result["critic_iterations"], - "trace": result["reasoning_trace"][:3] # First 3 steps + "trace": result["reasoning_trace"][:3], # First 3 steps } - + if is_correct: logger.info(f"✅ {problem_id}: SOLVED in {elapsed:.1f}s") self.results["solved"] += 1 else: logger.info(f"❌ {problem_id}: FAILED in {elapsed:.1f}s") self.results["failed"] += 1 - + return problem_result - + except Exception as e: elapsed = time.time() - start_time logger.error(f"❌ {problem_id}: ERROR in {elapsed:.1f}s - {e}") self.results["failed"] += 1 - + return { "id": problem_id, "query": query, "error": str(e), "solved": False, - "time": elapsed + "time": elapsed, } - - def evaluate(self, problem_file: str, max_problems: Optional[int] = None) -> Dict[str, Any]: + + def evaluate( + self, problem_file: str, max_problems: Optional[int] = None + ) -> Dict[str, Any]: """Evaluate all problems""" problems = self.load_problems(problem_file) - + if not problems: logger.error("No problems to evaluate") return self.results - + if max_problems: problems = problems[:max_problems] - + self.results["total_problems"] = len(problems) - + logger.info("=" * 60) logger.info(f"🧪 Starting ARC-AGI 2 Benchmark") logger.info(f"📊 Total problems: {len(problems)}") logger.info(f"🎯 Max problems: {max_problems or 'all'}") logger.info("=" * 60) - + start_time = time.time() - + for i, problem in enumerate(problems, 1): logger.info("") logger.info(f"Problem {i}/{len(problems)}") logger.info("-" * 40) - + result = self.evaluate_problem(problem) self.results["problems"].append(result) - + total_time = time.time() - start_time self.results["total_time"] = total_time - + # Calculate summary statistics - self.results["accuracy"] = self.results["solved"] / len(problems) if problems else 0 + self.results["accuracy"] = ( + self.results["solved"] / len(problems) if problems else 0 + ) self.results["average_time"] = total_time / len(problems) if problems else 0 - + # Print summary self._print_summary() - + return self.results - + def _check_solution(self, generated: str, expected: str, problem_type: str) -> bool: """Check if solution is correct""" if not expected: return True # No expected answer - + gen_normalized = generated.strip().lower() exp_normalized = expected.strip().lower() - + # Exact match if gen_normalized == exp_normalized: return True - + # Containment check if exp_normalized in gen_normalized or gen_normalized in exp_normalized: return True - + # For numerical answers, extract numbers if problem_type in ["numerical", "math"]: import re - gen_numbers = re.findall(r'\d+(?:\.\d+)?', gen_normalized) - exp_numbers = re.findall(r'\d+(?:\.\d+)?', exp_normalized) - + + gen_numbers = re.findall(r"\d+(?:\.\d+)?", gen_normalized) + exp_numbers = re.findall(r"\d+(?:\.\d+)?", exp_normalized) + if gen_numbers and exp_numbers: return gen_numbers[0] == exp_numbers[0] - + return False - + def _print_summary(self): """Print benchmark summary""" print("\n" + "=" * 60) print("📊 ARC-AGI 2 BENCHMARK RESULTS") print("=" * 60) - print(f"🎯 Accuracy: {self.results['accuracy']*100:.1f}% ({self.results['solved']}/{self.results['total_problems']})") + print( + f"🎯 Accuracy: {self.results['accuracy']*100:.1f}% ({self.results['solved']}/{self.results['total_problems']})" + ) print(f"⏱️ Total time: {self.results['total_time']:.1f}s") print(f"📈 Average time: {self.results['average_time']:.1f}s") print(f"✅ Solved: {self.results['solved']}") print(f"❌ Failed: {self.results['failed']}") print(f"⏰ Timeout: {self.results['timeout']}") print("=" * 60) - + def save_results(self, output_file: str): """Save results to file""" try: - with open(output_file, 'w', encoding='utf-8') as f: + with open(output_file, "w", encoding="utf-8") as f: json.dump(self.results, f, ensure_ascii=False, indent=2) logger.info(f"✅ Results saved to: {output_file}") except Exception as e: logger.error(f"Failed to save results: {e}") + def create_sample_problems(): """Create sample ARC-AGI problems for testing""" problems = { @@ -216,15 +226,15 @@ def create_sample_problems(): "expected": "32", "language": "english", "type": "numerical", - "category": "pattern_recognition" + "category": "pattern_recognition", }, { - "id": "logic_001", + "id": "logic_001", "query": "All roses are flowers. Some flowers fade quickly. Therefore, some roses fade quickly. Is this reasoning correct?", "expected": "No, this reasoning is not necessarily correct", "language": "english", "type": "logical", - "category": "syllogism" + "category": "syllogism", }, { "id": "math_001", @@ -232,7 +242,7 @@ def create_sample_problems(): "expected": "60 किमी प्रति घंटा", "language": "hindi", "type": "math", - "category": "word_problem" + "category": "word_problem", }, { "id": "tamil_001", @@ -240,7 +250,7 @@ def create_sample_problems(): "expected": "8 ஆப்பிள்கள்", "language": "tamil", "type": "math", - "category": "subtraction" + "category": "subtraction", }, { "id": "pattern_002", @@ -248,15 +258,15 @@ def create_sample_problems(): "expected": "I", "language": "english", "type": "pattern", - "category": "alphabet_sequence" + "category": "alphabet_sequence", }, { "id": "spatial_001", "query": "If you rotate a square 90 degrees clockwise, what shape do you get?", "expected": "A square", - "language": "english", + "language": "english", "type": "spatial", - "category": "rotation" + "category": "rotation", }, { "id": "bengali_001", @@ -264,16 +274,17 @@ def create_sample_problems(): "expected": "40 বর্গমিটার", "language": "bengali", "type": "math", - "category": "area" - } + "category": "area", + }, ] } - + return problems + def main(): import argparse - + parser = argparse.ArgumentParser( description="ARC-AGI 2 Benchmark for Ariv", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -290,31 +301,35 @@ def main(): # Create sample problems file python arc_benchmark.py --create-sample - """ + """, ) - + parser.add_argument("--problems", help="JSON file containing problems") parser.add_argument("--sample", action="store_true", help="Use sample problems") parser.add_argument("--max", type=int, help="Maximum number of problems to run") - parser.add_argument("--create-sample", action="store_true", help="Create sample problems file") - parser.add_argument("--output", default="arc_results.json", help="Output file for results") - + parser.add_argument( + "--create-sample", action="store_true", help="Create sample problems file" + ) + parser.add_argument( + "--output", default="arc_results.json", help="Output file for results" + ) + args = parser.parse_args() - + # Handle create sample if args.create_sample: problems = create_sample_problems() - with open("sample_problems.json", 'w', encoding='utf-8') as f: + with open("sample_problems.json", "w", encoding="utf-8") as f: json.dump(problems, f, ensure_ascii=False, indent=2) print("✅ Sample problems created: sample_problems.json") return - + # Determine problem file problem_file = None if args.sample: problems = create_sample_problems() # Save temporarily - with open("temp_problems.json", 'w', encoding='utf-8') as f: + with open("temp_problems.json", "w", encoding="utf-8") as f: json.dump(problems, f, ensure_ascii=False, indent=2) problem_file = "temp_problems.json" elif args.problems: @@ -322,47 +337,49 @@ def main(): else: print("❌ Please specify --problems, --sample, or --create-sample") return - + # Check if file exists if not Path(problem_file).exists(): print(f"❌ Problem file not found: {problem_file}") return - + # Initialize system print("🚀 Initializing Ariv system...") try: model_paths = get_model_paths() orchestrator = JugaadOrchestrator(model_paths) - + # Load prompts prompts_file = Path(__file__).parent.parent / "prompts" / "meta_prompts.yaml" if prompts_file.exists(): import yaml - with open(prompts_file, 'r', encoding='utf-8') as f: + + with open(prompts_file, "r", encoding="utf-8") as f: prompts = yaml.safe_load(f) else: prompts = {} - + pipeline = TRVPipeline(orchestrator, prompts) print("✅ System initialized") - + except Exception as e: print(f"❌ Failed to initialize system: {e}") return - + # Run benchmark try: benchmark = ARCBenchmark(pipeline) results = benchmark.evaluate(problem_file, max_problems=args.max) benchmark.save_results(args.output) - + except Exception as e: print(f"❌ Benchmark failed: {e}") - + finally: # Cleanup temp file if args.sample and Path("temp_problems.json").exists(): Path("temp_problems.json").unlink() + if __name__ == "__main__": main() diff --git a/benchmarks/arc_hinglish.py b/benchmarks/arc_hinglish.py index fffef57..f5ec8af 100644 --- a/benchmarks/arc_hinglish.py +++ b/benchmarks/arc_hinglish.py @@ -15,6 +15,7 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("ARC-Hinglish") + class ARCHinglishRunner: """ Adapts ARC-AGI style abstract reasoning for Hinglish (Hindi-English code-mixing) @@ -48,12 +49,14 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict: prompt_parts.append("Main neeche diye gaye examples se pattern seekhna hai:") for i, ex in enumerate(task_examples, 1): - desc = ex.get('description', '') + desc = ex.get("description", "") prompt_parts.append(f"\nExample {i}: {desc}") prompt_parts.append(f"Input: {ex['input']}") prompt_parts.append(f"Output: {ex['output']}") - prompt_parts.append(f"\nAb isko solve karo: {test_input.get('description', '')}") + prompt_parts.append( + f"\nAb isko solve karo: {test_input.get('description', '')}" + ) prompt_parts.append(f"Test Input: {test_input['input']}") prompt_parts.append("Test Output:") @@ -68,9 +71,9 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict: result = self.pipeline.execute( query=full_prompt, language="hinglish", - enable_critic=True # Self-correction + enable_critic=True, # Self-correction ) - candidates.append(result['final_answer']) + candidates.append(result["final_answer"]) # Simple majority voting (exact match on grid) prediction = self._majority_vote(candidates) @@ -79,18 +82,20 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict: "prediction": prediction, "candidates": candidates, "confidence": self._calculate_confidence(candidates), - "reasoning": result['reasoning_trace'] + "reasoning": result["reasoning_trace"], } def _majority_vote(self, candidates: List[str]) -> str: """Select most common answer""" from collections import Counter + vote_counts = Counter(candidates) return vote_counts.most_common(1)[0][0] def _calculate_confidence(self, candidates: List[str]) -> float: """Calculate agreement ratio between candidates""" from collections import Counter + vote_counts = Counter(candidates) top_count = vote_counts.most_common(1)[0][1] return top_count / len(candidates) @@ -108,12 +113,9 @@ def evaluate_dataset(self, dataset_path: str) -> Dict: for task in tasks: try: - result = self.solve_task( - task['train'], - task['test'] - ) + result = self.solve_task(task["train"], task["test"]) - if result['prediction'] == task['test']['output']: + if result["prediction"] == task["test"]["output"]: correct += 1 total += 1 @@ -125,11 +127,8 @@ def evaluate_dataset(self, dataset_path: str) -> Dict: accuracy = correct / total if total > 0 else 0 logger.info(f"ARC-Hinglish Accuracy: {accuracy:.2%} ({correct}/{total})") - return { - "accuracy": accuracy, - "correct": correct, - "total": total - } + return {"accuracy": accuracy, "correct": correct, "total": total} + if __name__ == "__main__": import argparse diff --git a/benchmarks/run_bench.py b/benchmarks/run_bench.py index 86e45bc..13ba62d 100644 --- a/benchmarks/run_bench.py +++ b/benchmarks/run_bench.py @@ -94,7 +94,9 @@ async def _collect() -> None: return "".join(result), duration, token_count -def run_benchmark(models: List[str], lang: str, subset: str, output_dir: Path) -> Tuple[Path, Path]: +def run_benchmark( + models: List[str], lang: str, subset: str, output_dir: Path +) -> Tuple[Path, Path]: dataset = _load_dataset(lang, subset) if not dataset: raise ValueError("Dataset not found or empty") @@ -134,7 +136,16 @@ def run_benchmark(models: List[str], lang: str, subset: str, output_dir: Path) - with csv_path.open("w", newline="", encoding="utf-8") as handle: writer = csv.writer(handle) writer.writerow( - ["model", "lang", "subset", "bleu", "chrf", "throughput_tps", "latency_p50", "latency_p95"] + [ + "model", + "lang", + "subset", + "bleu", + "chrf", + "throughput_tps", + "latency_p50", + "latency_p95", + ] ) for row in rows: writer.writerow( @@ -174,7 +185,12 @@ def main() -> None: parser.add_argument("--subset", default="dev") args = parser.parse_args() - run_benchmark(models=args.models, lang=args.lang, subset=args.subset, output_dir=Path("benchmarks/results")) + run_benchmark( + models=args.models, + lang=args.lang, + subset=args.subset, + output_dir=Path("benchmarks/results"), + ) if __name__ == "__main__": diff --git a/benchmarks/sanskriti_eval.py b/benchmarks/sanskriti_eval.py index 60b3d45..f691ebb 100644 --- a/benchmarks/sanskriti_eval.py +++ b/benchmarks/sanskriti_eval.py @@ -16,12 +16,13 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("SANSKRITI") + class SanskritiBenchmark: """ Evaluates the Maha-System on the SANSKRITI dataset: 21,853 question-answer pairs across Indian states/UTs covering: - Rituals and festivals - - Regional cuisine + - Regional cuisine - Local customs and Little Traditions - Traditional medicine (Ayurveda) """ @@ -48,7 +49,7 @@ def load_dataset(self) -> List[Dict]: with open(self.data_path) as f: data = json.load(f) - return data.get('examples', []) + return data.get("examples", []) def evaluate(self, max_samples: int = None) -> Dict: """ @@ -56,7 +57,7 @@ def evaluate(self, max_samples: int = None) -> Dict: Returns accuracy metrics by category: - Overall accuracy - - Rituals accuracy + - Rituals accuracy - Cuisine accuracy - Regional customs accuracy """ @@ -69,26 +70,24 @@ def evaluate(self, max_samples: int = None) -> Dict: "rituals": {"correct": 0, "total": 0}, "cuisine": {"correct": 0, "total": 0}, "customs": {"correct": 0, "total": 0}, - "festivals": {"correct": 0, "total": 0} + "festivals": {"correct": 0, "total": 0}, } logger.info(f"🧪 Running SANSKRITI evaluation on {len(dataset)} samples...") for item in tqdm(dataset): - question = item['question'] - expected = item['answer'] - category = item.get('category', 'general') - language = item.get('language', 'hindi') + question = item["question"] + expected = item["answer"] + category = item.get("category", "general") + language = item.get("language", "hindi") try: # Run TRV pipeline result = self.pipeline.execute( - query=question, - language=language, - enable_critic=True + query=question, language=language, enable_critic=True ) - predicted = result['final_answer'] + predicted = result["final_answer"] # Simple exact match (can be improved with semantic similarity) is_correct = self._check_answer(predicted, expected) @@ -101,13 +100,15 @@ def evaluate(self, max_samples: int = None) -> Dict: if category in category_stats: category_stats[category]["total"] += 1 - self.results.append({ - "question": question, - "expected": expected, - "predicted": predicted, - "correct": is_correct, - "category": category - }) + self.results.append( + { + "question": question, + "expected": expected, + "predicted": predicted, + "correct": is_correct, + "category": category, + } + ) except Exception as e: logger.error(f"Error on question: {e}") @@ -121,7 +122,7 @@ def evaluate(self, max_samples: int = None) -> Dict: "overall_accuracy": accuracy, "total_samples": total, "correct": correct, - "by_category": {} + "by_category": {}, } for cat, stats in category_stats.items(): @@ -130,7 +131,7 @@ def evaluate(self, max_samples: int = None) -> Dict: metrics["by_category"][cat] = { "accuracy": cat_acc, "correct": stats["correct"], - "total": stats["total"] + "total": stats["total"], } return metrics @@ -145,22 +146,22 @@ def _check_answer(self, predicted: str, expected: str) -> bool: def save_results(self, output_path: str = "sanskriti_results.json"): """Save detailed results to file""" - with open(output_path, 'w') as f: - json.dump({ - "metrics": self.get_metrics(), - "predictions": self.results - }, f, indent=2, ensure_ascii=False) + with open(output_path, "w") as f: + json.dump( + {"metrics": self.get_metrics(), "predictions": self.results}, + f, + indent=2, + ensure_ascii=False, + ) logger.info(f"💾 Results saved to {output_path}") def get_metrics(self): """Return current metrics""" if not self.results: return {} - correct = sum(1 for r in self.results if r['correct']) - return { - "accuracy": correct / len(self.results), - "samples": len(self.results) - } + correct = sum(1 for r in self.results if r["correct"]) + return {"accuracy": correct / len(self.results), "samples": len(self.results)} + if __name__ == "__main__": import argparse @@ -179,7 +180,7 @@ def get_metrics(self): print(f"\n📊 SANSKRITI Results:") print(f"Overall Accuracy: {metrics['overall_accuracy']:.2%}") - for cat, stats in metrics['by_category'].items(): + for cat, stats in metrics["by_category"].items(): print(f" {cat}: {stats['accuracy']:.2%} ({stats['correct']}/{stats['total']})") benchmark.save_results(args.output)