From 22185051a9758f4da633a2fe608b92d9329df676 Mon Sep 17 00:00:00 2001
From: Harsha Vardhan <harvatechs@gmail.com>
Date: Wed, 11 Feb 2026 18:19:09 +0530
Subject: [PATCH] Format Python sources with Black

---
 ariv/cli/arivctl.py                  |   4 +-
 ariv/orchestrator/__init__.py        |   7 +-
 ariv/orchestrator/router.py          |   2 +-
 ariv/runner/llama_cli.py             |   8 +-
 ariv/scripts/convert_and_quantize.py |   4 +-
 benchmarks/arc_benchmark.py          | 175 +++++++++++++++------------
 benchmarks/arc_hinglish.py           |  29 +++--
 benchmarks/run_bench.py              |  22 +++-
 benchmarks/sanskriti_eval.py         |  65 +++++-----
 9 files changed, 181 insertions(+), 135 deletions(-)

diff --git a/ariv/cli/arivctl.py b/ariv/cli/arivctl.py
index ab602aa..15ceaa1 100644
--- a/ariv/cli/arivctl.py
+++ b/ariv/cli/arivctl.py
@@ -43,7 +43,9 @@ def cmd_start(host: str, port: int) -> None:
 def cmd_bench(models: List[str], lang: str, subset: str) -> None:
     from benchmarks.run_bench import run_benchmark
 
-    run_benchmark(models=models, lang=lang, subset=subset, output_dir=Path("benchmarks/results"))
+    run_benchmark(
+        models=models, lang=lang, subset=subset, output_dir=Path("benchmarks/results")
+    )
 
 
 def cmd_download(dry_run: bool) -> None:
diff --git a/ariv/orchestrator/__init__.py b/ariv/orchestrator/__init__.py
index e398159..bd22a17 100644
--- a/ariv/orchestrator/__init__.py
+++ b/ariv/orchestrator/__init__.py
@@ -1,5 +1,10 @@
 """Orchestration components."""
 
-from ariv.orchestrator.router import HardwareProfile, ModelManager, RouteDecision, Router
+from ariv.orchestrator.router import (
+    HardwareProfile,
+    ModelManager,
+    RouteDecision,
+    Router,
+)
 
 __all__ = ["HardwareProfile", "ModelManager", "RouteDecision", "Router"]
diff --git a/ariv/orchestrator/router.py b/ariv/orchestrator/router.py
index dd55c09..4b71a84 100644
--- a/ariv/orchestrator/router.py
+++ b/ariv/orchestrator/router.py
@@ -53,7 +53,7 @@ def _detect_indic(preferred_lang: Optional[str], text: str) -> bool:
     if preferred_lang and preferred_lang.lower() in INDIC_LANGS:
         return True
     for char in text:
-        if "\u0900" <= char <= "\u0DFF":
+        if "\u0900" <= char <= "\u0dff":
             return True
     return False
 
diff --git a/ariv/runner/llama_cli.py b/ariv/runner/llama_cli.py
index b134036..4c751fa 100644
--- a/ariv/runner/llama_cli.py
+++ b/ariv/runner/llama_cli.py
@@ -81,7 +81,9 @@ async def _collect_stderr() -> None:
             except json.JSONDecodeError:
                 yield line
                 continue
-            token = payload.get("token") or payload.get("content") or payload.get("text")
+            token = (
+                payload.get("token") or payload.get("content") or payload.get("text")
+            )
             if token:
                 yield str(token)
 
@@ -89,7 +91,9 @@ async def _collect_stderr() -> None:
         await stderr_task
 
         if process.returncode != 0:
-            stderr_tail = b"".join(stderr_chunks).decode("utf-8", errors="replace").strip()
+            stderr_tail = (
+                b"".join(stderr_chunks).decode("utf-8", errors="replace").strip()
+            )
             if len(stderr_tail) > 1200:
                 stderr_tail = stderr_tail[-1200:]
             raise RuntimeError(
diff --git a/ariv/scripts/convert_and_quantize.py b/ariv/scripts/convert_and_quantize.py
index 03646ea..dd3a818 100644
--- a/ariv/scripts/convert_and_quantize.py
+++ b/ariv/scripts/convert_and_quantize.py
@@ -17,7 +17,9 @@ def main() -> None:
     parser.add_argument("--hf-repo", required=True, help="Hugging Face repo path")
     parser.add_argument("--output", required=True, help="Output GGUF path")
     parser.add_argument("--quant", default="Q4_K_M", choices=["Q4_K_M", "Q5_0", "Q4_0"])
-    parser.add_argument("--llama-cpp", default="llama.cpp", help="Path to llama.cpp repo")
+    parser.add_argument(
+        "--llama-cpp", default="llama.cpp", help="Path to llama.cpp repo"
+    )
     args = parser.parse_args()
 
     llama_dir = Path(args.llama_cpp)
diff --git a/benchmarks/arc_benchmark.py b/benchmarks/arc_benchmark.py
index e4629ca..81bccdf 100644
--- a/benchmarks/arc_benchmark.py
+++ b/benchmarks/arc_benchmark.py
@@ -13,6 +13,7 @@
 
 # Add parent directory to path for imports
 import sys
+
 sys.path.append(str(Path(__file__).parent.parent))
 
 from core.orchestrator import JugaadOrchestrator
@@ -22,9 +23,10 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("ARC-Benchmark")
 
+
 class ARCBenchmark:
     """ARC-AGI 2 Benchmark Runner"""
-    
+
     def __init__(self, pipeline: TRVPipeline):
         self.pipeline = pipeline
         self.results = {
@@ -36,31 +38,33 @@ def __init__(self, pipeline: TRVPipeline):
             "failed": 0,
             "timeout": 0,
             "total_time": 0.0,
-            "problems": []
+            "problems": [],
         }
-        
+
     def load_problems(self, problem_file: str) -> List[Dict[str, Any]]:
         """Load ARC-AGI problems from file"""
         try:
-            with open(problem_file, 'r', encoding='utf-8') as f:
+            with open(problem_file, "r", encoding="utf-8") as f:
                 data = json.load(f)
                 return data.get("problems", [])
         except Exception as e:
             logger.error(f"Failed to load problems: {e}")
             return []
-            
-    def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+
+    def evaluate_problem(
+        self, problem: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
         """Evaluate a single problem"""
         problem_id = problem.get("id", "unknown")
         query = problem.get("query", "")
         expected = problem.get("expected", "")
         language = problem.get("language", "english")
         problem_type = problem.get("type", "abstract")
-        
+
         logger.info(f"🧪 Evaluating {problem_id}: {query[:50]}...")
-        
+
         start_time = time.time()
-        
+
         try:
             # Execute pipeline with ARC-AGI optimized settings
             result = self.pipeline.execute(
@@ -69,18 +73,16 @@ def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[
                 enable_critic=True,
                 enable_deep_cot=True,
                 enable_self_consistency=True,
-                reasoning_model="reasoner"
+                reasoning_model="reasoner",
             )
-            
+
             elapsed = time.time() - start_time
-            
+
             # Check if solution is correct
             is_correct = self._check_solution(
-                result["final_answer"], 
-                expected, 
-                problem_type
+                result["final_answer"], expected, problem_type
             )
-            
+
             problem_result = {
                 "id": problem_id,
                 "query": query,
@@ -91,121 +93,129 @@ def evaluate_problem(self, problem: Dict[str, Any], timeout: int = 300) -> Dict[
                 "solved": is_correct,
                 "time": elapsed,
                 "critic_iterations": result["critic_iterations"],
-                "trace": result["reasoning_trace"][:3]  # First 3 steps
+                "trace": result["reasoning_trace"][:3],  # First 3 steps
             }
-            
+
             if is_correct:
                 logger.info(f"✅ {problem_id}: SOLVED in {elapsed:.1f}s")
                 self.results["solved"] += 1
             else:
                 logger.info(f"❌ {problem_id}: FAILED in {elapsed:.1f}s")
                 self.results["failed"] += 1
-                
+
             return problem_result
-            
+
         except Exception as e:
             elapsed = time.time() - start_time
             logger.error(f"❌ {problem_id}: ERROR in {elapsed:.1f}s - {e}")
             self.results["failed"] += 1
-            
+
             return {
                 "id": problem_id,
                 "query": query,
                 "error": str(e),
                 "solved": False,
-                "time": elapsed
+                "time": elapsed,
             }
-            
-    def evaluate(self, problem_file: str, max_problems: Optional[int] = None) -> Dict[str, Any]:
+
+    def evaluate(
+        self, problem_file: str, max_problems: Optional[int] = None
+    ) -> Dict[str, Any]:
         """Evaluate all problems"""
         problems = self.load_problems(problem_file)
-        
+
         if not problems:
             logger.error("No problems to evaluate")
             return self.results
-            
+
         if max_problems:
             problems = problems[:max_problems]
-            
+
         self.results["total_problems"] = len(problems)
-        
+
         logger.info("=" * 60)
         logger.info(f"🧪 Starting ARC-AGI 2 Benchmark")
         logger.info(f"📊 Total problems: {len(problems)}")
         logger.info(f"🎯 Max problems: {max_problems or 'all'}")
         logger.info("=" * 60)
-        
+
         start_time = time.time()
-        
+
         for i, problem in enumerate(problems, 1):
             logger.info("")
             logger.info(f"Problem {i}/{len(problems)}")
             logger.info("-" * 40)
-            
+
             result = self.evaluate_problem(problem)
             self.results["problems"].append(result)
-            
+
         total_time = time.time() - start_time
         self.results["total_time"] = total_time
-        
+
         # Calculate summary statistics
-        self.results["accuracy"] = self.results["solved"] / len(problems) if problems else 0
+        self.results["accuracy"] = (
+            self.results["solved"] / len(problems) if problems else 0
+        )
         self.results["average_time"] = total_time / len(problems) if problems else 0
-        
+
         # Print summary
         self._print_summary()
-        
+
         return self.results
-        
+
     def _check_solution(self, generated: str, expected: str, problem_type: str) -> bool:
         """Check if solution is correct"""
         if not expected:
             return True  # No expected answer
-            
+
         gen_normalized = generated.strip().lower()
         exp_normalized = expected.strip().lower()
-        
+
         # Exact match
         if gen_normalized == exp_normalized:
             return True
-            
+
         # Containment check
         if exp_normalized in gen_normalized or gen_normalized in exp_normalized:
             return True
-            
+
         # For numerical answers, extract numbers
         if problem_type in ["numerical", "math"]:
             import re
-            gen_numbers = re.findall(r'\d+(?:\.\d+)?', gen_normalized)
-            exp_numbers = re.findall(r'\d+(?:\.\d+)?', exp_normalized)
-            
+
+            gen_numbers = re.findall(r"\d+(?:\.\d+)?", gen_normalized)
+            exp_numbers = re.findall(r"\d+(?:\.\d+)?", exp_normalized)
+
             if gen_numbers and exp_numbers:
                 return gen_numbers[0] == exp_numbers[0]
-                
+
         return False
-        
+
     def _print_summary(self):
         """Print benchmark summary"""
         print("\n" + "=" * 60)
         print("📊 ARC-AGI 2 BENCHMARK RESULTS")
         print("=" * 60)
-        print(f"🎯 Accuracy: {self.results['accuracy']*100:.1f}% ({self.results['solved']}/{self.results['total_problems']})")
+        print(
+            f"🎯 Accuracy: {self.results['accuracy']*100:.1f}% ({self.results['solved']}/{self.results['total_problems']})"
+        )
         print(f"⏱️  Total time: {self.results['total_time']:.1f}s")
         print(f"📈 Average time: {self.results['average_time']:.1f}s")
         print(f"✅ Solved: {self.results['solved']}")
         print(f"❌ Failed: {self.results['failed']}")
         print(f"⏰ Timeout: {self.results['timeout']}")
         print("=" * 60)
-        
+
     def save_results(self, output_file: str):
         """Save results to file"""
         try:
-            with open(output_file, 'w', encoding='utf-8') as f:
+            with open(output_file, "w", encoding="utf-8") as f:
                 json.dump(self.results, f, ensure_ascii=False, indent=2)
             logger.info(f"✅ Results saved to: {output_file}")
         except Exception as e:
             logger.error(f"Failed to save results: {e}")
 
+
 def create_sample_problems():
     """Create sample ARC-AGI problems for testing"""
     problems = {
@@ -216,15 +226,15 @@ def create_sample_problems():
                 "expected": "32",
                 "language": "english",
                 "type": "numerical",
-                "category": "pattern_recognition"
+                "category": "pattern_recognition",
             },
             {
-                "id": "logic_001", 
+                "id": "logic_001",
                 "query": "All roses are flowers. Some flowers fade quickly. Therefore, some roses fade quickly. Is this reasoning correct?",
                 "expected": "No, this reasoning is not necessarily correct",
                 "language": "english",
                 "type": "logical",
-                "category": "syllogism"
+                "category": "syllogism",
             },
             {
                 "id": "math_001",
@@ -232,7 +242,7 @@ def create_sample_problems():
                 "expected": "60 किमी प्रति घंटा",
                 "language": "hindi",
                 "type": "math",
-                "category": "word_problem"
+                "category": "word_problem",
             },
             {
                 "id": "tamil_001",
@@ -240,7 +250,7 @@ def create_sample_problems():
                 "expected": "8 ஆப்பிள்கள்",
                 "language": "tamil",
                 "type": "math",
-                "category": "subtraction"
+                "category": "subtraction",
             },
             {
                 "id": "pattern_002",
@@ -248,15 +258,15 @@ def create_sample_problems():
                 "expected": "I",
                 "language": "english",
                 "type": "pattern",
-                "category": "alphabet_sequence"
+                "category": "alphabet_sequence",
             },
             {
                 "id": "spatial_001",
                 "query": "If you rotate a square 90 degrees clockwise, what shape do you get?",
                 "expected": "A square",
-                "language": "english", 
+                "language": "english",
                 "type": "spatial",
-                "category": "rotation"
+                "category": "rotation",
             },
             {
                 "id": "bengali_001",
@@ -264,16 +274,17 @@ def create_sample_problems():
                 "expected": "40 বর্গমিটার",
                 "language": "bengali",
                 "type": "math",
-                "category": "area"
-            }
+                "category": "area",
+            },
         ]
     }
-    
+
     return problems
 
+
 def main():
     import argparse
-    
+
     parser = argparse.ArgumentParser(
         description="ARC-AGI 2 Benchmark for Ariv",
         formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -290,31 +301,35 @@ def main():
   
   # Create sample problems file
   python arc_benchmark.py --create-sample
-        """
+        """,
     )
-    
+
     parser.add_argument("--problems", help="JSON file containing problems")
     parser.add_argument("--sample", action="store_true", help="Use sample problems")
     parser.add_argument("--max", type=int, help="Maximum number of problems to run")
-    parser.add_argument("--create-sample", action="store_true", help="Create sample problems file")
-    parser.add_argument("--output", default="arc_results.json", help="Output file for results")
-    
+    parser.add_argument(
+        "--create-sample", action="store_true", help="Create sample problems file"
+    )
+    parser.add_argument(
+        "--output", default="arc_results.json", help="Output file for results"
+    )
+
     args = parser.parse_args()
-    
+
     # Handle create sample
     if args.create_sample:
         problems = create_sample_problems()
-        with open("sample_problems.json", 'w', encoding='utf-8') as f:
+        with open("sample_problems.json", "w", encoding="utf-8") as f:
             json.dump(problems, f, ensure_ascii=False, indent=2)
         print("✅ Sample problems created: sample_problems.json")
         return
-    
+
     # Determine problem file
     problem_file = None
     if args.sample:
         problems = create_sample_problems()
         # Save temporarily
-        with open("temp_problems.json", 'w', encoding='utf-8') as f:
+        with open("temp_problems.json", "w", encoding="utf-8") as f:
             json.dump(problems, f, ensure_ascii=False, indent=2)
         problem_file = "temp_problems.json"
     elif args.problems:
@@ -322,47 +337,49 @@ def main():
     else:
         print("❌ Please specify --problems, --sample, or --create-sample")
         return
-    
+
     # Check if file exists
     if not Path(problem_file).exists():
         print(f"❌ Problem file not found: {problem_file}")
         return
-    
+
     # Initialize system
     print("🚀 Initializing Ariv system...")
     try:
         model_paths = get_model_paths()
         orchestrator = JugaadOrchestrator(model_paths)
-        
+
         # Load prompts
         prompts_file = Path(__file__).parent.parent / "prompts" / "meta_prompts.yaml"
         if prompts_file.exists():
             import yaml
-            with open(prompts_file, 'r', encoding='utf-8') as f:
+
+            with open(prompts_file, "r", encoding="utf-8") as f:
                 prompts = yaml.safe_load(f)
         else:
             prompts = {}
-            
+
         pipeline = TRVPipeline(orchestrator, prompts)
         print("✅ System initialized")
-        
+
     except Exception as e:
         print(f"❌ Failed to initialize system: {e}")
         return
-    
+
     # Run benchmark
     try:
         benchmark = ARCBenchmark(pipeline)
         results = benchmark.evaluate(problem_file, max_problems=args.max)
         benchmark.save_results(args.output)
-        
+
     except Exception as e:
         print(f"❌ Benchmark failed: {e}")
-        
+
     finally:
         # Cleanup temp file
         if args.sample and Path("temp_problems.json").exists():
             Path("temp_problems.json").unlink()
 
+
 if __name__ == "__main__":
     main()
diff --git a/benchmarks/arc_hinglish.py b/benchmarks/arc_hinglish.py
index fffef57..f5ec8af 100644
--- a/benchmarks/arc_hinglish.py
+++ b/benchmarks/arc_hinglish.py
@@ -15,6 +15,7 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("ARC-Hinglish")
 
+
 class ARCHinglishRunner:
     """
     Adapts ARC-AGI style abstract reasoning for Hinglish (Hindi-English code-mixing)
@@ -48,12 +49,14 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict:
         prompt_parts.append("Main neeche diye gaye examples se pattern seekhna hai:")
 
         for i, ex in enumerate(task_examples, 1):
-            desc = ex.get('description', '')
+            desc = ex.get("description", "")
             prompt_parts.append(f"\nExample {i}: {desc}")
             prompt_parts.append(f"Input: {ex['input']}")
             prompt_parts.append(f"Output: {ex['output']}")
 
-        prompt_parts.append(f"\nAb isko solve karo: {test_input.get('description', '')}")
+        prompt_parts.append(
+            f"\nAb isko solve karo: {test_input.get('description', '')}"
+        )
         prompt_parts.append(f"Test Input: {test_input['input']}")
         prompt_parts.append("Test Output:")
 
@@ -68,9 +71,9 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict:
             result = self.pipeline.execute(
                 query=full_prompt,
                 language="hinglish",
-                enable_critic=True  # Self-correction
+                enable_critic=True,  # Self-correction
             )
-            candidates.append(result['final_answer'])
+            candidates.append(result["final_answer"])
 
         # Simple majority voting (exact match on grid)
         prediction = self._majority_vote(candidates)
@@ -79,18 +82,20 @@ def solve_task(self, task_examples: List[Dict], test_input: Dict) -> Dict:
             "prediction": prediction,
             "candidates": candidates,
             "confidence": self._calculate_confidence(candidates),
-            "reasoning": result['reasoning_trace']
+            "reasoning": result["reasoning_trace"],
         }
 
     def _majority_vote(self, candidates: List[str]) -> str:
         """Select most common answer"""
         from collections import Counter
+
         vote_counts = Counter(candidates)
         return vote_counts.most_common(1)[0][0]
 
     def _calculate_confidence(self, candidates: List[str]) -> float:
         """Calculate agreement ratio between candidates"""
         from collections import Counter
+
         vote_counts = Counter(candidates)
         top_count = vote_counts.most_common(1)[0][1]
         return top_count / len(candidates)
@@ -108,12 +113,9 @@ def evaluate_dataset(self, dataset_path: str) -> Dict:
 
         for task in tasks:
             try:
-                result = self.solve_task(
-                    task['train'],
-                    task['test']
-                )
+                result = self.solve_task(task["train"], task["test"])
 
-                if result['prediction'] == task['test']['output']:
+                if result["prediction"] == task["test"]["output"]:
                     correct += 1
 
                 total += 1
@@ -125,11 +127,8 @@ def evaluate_dataset(self, dataset_path: str) -> Dict:
         accuracy = correct / total if total > 0 else 0
         logger.info(f"ARC-Hinglish Accuracy: {accuracy:.2%} ({correct}/{total})")
 
-        return {
-            "accuracy": accuracy,
-            "correct": correct,
-            "total": total
-        }
+        return {"accuracy": accuracy, "correct": correct, "total": total}
+
 
 if __name__ == "__main__":
     import argparse
diff --git a/benchmarks/run_bench.py b/benchmarks/run_bench.py
index 86e45bc..13ba62d 100644
--- a/benchmarks/run_bench.py
+++ b/benchmarks/run_bench.py
@@ -94,7 +94,9 @@ async def _collect() -> None:
     return "".join(result), duration, token_count
 
 
-def run_benchmark(models: List[str], lang: str, subset: str, output_dir: Path) -> Tuple[Path, Path]:
+def run_benchmark(
+    models: List[str], lang: str, subset: str, output_dir: Path
+) -> Tuple[Path, Path]:
     dataset = _load_dataset(lang, subset)
     if not dataset:
         raise ValueError("Dataset not found or empty")
@@ -134,7 +136,16 @@ def run_benchmark(models: List[str], lang: str, subset: str, output_dir: Path) -
     with csv_path.open("w", newline="", encoding="utf-8") as handle:
         writer = csv.writer(handle)
         writer.writerow(
-            ["model", "lang", "subset", "bleu", "chrf", "throughput_tps", "latency_p50", "latency_p95"]
+            [
+                "model",
+                "lang",
+                "subset",
+                "bleu",
+                "chrf",
+                "throughput_tps",
+                "latency_p50",
+                "latency_p95",
+            ]
         )
         for row in rows:
             writer.writerow(
@@ -174,7 +185,12 @@ def main() -> None:
     parser.add_argument("--subset", default="dev")
     args = parser.parse_args()
 
-    run_benchmark(models=args.models, lang=args.lang, subset=args.subset, output_dir=Path("benchmarks/results"))
+    run_benchmark(
+        models=args.models,
+        lang=args.lang,
+        subset=args.subset,
+        output_dir=Path("benchmarks/results"),
+    )
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/sanskriti_eval.py b/benchmarks/sanskriti_eval.py
index 60b3d45..f691ebb 100644
--- a/benchmarks/sanskriti_eval.py
+++ b/benchmarks/sanskriti_eval.py
@@ -16,12 +16,13 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("SANSKRITI")
 
+
 class SanskritiBenchmark:
     """
     Evaluates the Maha-System on the SANSKRITI dataset:
     21,853 question-answer pairs across Indian states/UTs covering:
     - Rituals and festivals
-    - Regional cuisine  
+    - Regional cuisine
     - Local customs and Little Traditions
     - Traditional medicine (Ayurveda)
     """
@@ -48,7 +49,7 @@ def load_dataset(self) -> List[Dict]:
 
         with open(self.data_path) as f:
             data = json.load(f)
-        return data.get('examples', [])
+        return data.get("examples", [])
 
     def evaluate(self, max_samples: int = None) -> Dict:
         """
@@ -56,7 +57,7 @@ def evaluate(self, max_samples: int = None) -> Dict:
 
         Returns accuracy metrics by category:
         - Overall accuracy
-        - Rituals accuracy  
+        - Rituals accuracy
         - Cuisine accuracy
         - Regional customs accuracy
         """
@@ -69,26 +70,24 @@ def evaluate(self, max_samples: int = None) -> Dict:
             "rituals": {"correct": 0, "total": 0},
             "cuisine": {"correct": 0, "total": 0},
             "customs": {"correct": 0, "total": 0},
-            "festivals": {"correct": 0, "total": 0}
+            "festivals": {"correct": 0, "total": 0},
         }
 
         logger.info(f"🧪 Running SANSKRITI evaluation on {len(dataset)} samples...")
 
         for item in tqdm(dataset):
-            question = item['question']
-            expected = item['answer']
-            category = item.get('category', 'general')
-            language = item.get('language', 'hindi')
+            question = item["question"]
+            expected = item["answer"]
+            category = item.get("category", "general")
+            language = item.get("language", "hindi")
 
             try:
                 # Run TRV pipeline
                 result = self.pipeline.execute(
-                    query=question,
-                    language=language,
-                    enable_critic=True
+                    query=question, language=language, enable_critic=True
                 )
 
-                predicted = result['final_answer']
+                predicted = result["final_answer"]
 
                 # Simple exact match (can be improved with semantic similarity)
                 is_correct = self._check_answer(predicted, expected)
@@ -101,13 +100,15 @@ def evaluate(self, max_samples: int = None) -> Dict:
                 if category in category_stats:
                     category_stats[category]["total"] += 1
 
-                self.results.append({
-                    "question": question,
-                    "expected": expected,
-                    "predicted": predicted,
-                    "correct": is_correct,
-                    "category": category
-                })
+                self.results.append(
+                    {
+                        "question": question,
+                        "expected": expected,
+                        "predicted": predicted,
+                        "correct": is_correct,
+                        "category": category,
+                    }
+                )
 
             except Exception as e:
                 logger.error(f"Error on question: {e}")
@@ -121,7 +122,7 @@ def evaluate(self, max_samples: int = None) -> Dict:
             "overall_accuracy": accuracy,
             "total_samples": total,
             "correct": correct,
-            "by_category": {}
+            "by_category": {},
         }
 
         for cat, stats in category_stats.items():
@@ -130,7 +131,7 @@ def evaluate(self, max_samples: int = None) -> Dict:
                 metrics["by_category"][cat] = {
                     "accuracy": cat_acc,
                     "correct": stats["correct"],
-                    "total": stats["total"]
+                    "total": stats["total"],
                 }
 
         return metrics
@@ -145,22 +146,22 @@ def _check_answer(self, predicted: str, expected: str) -> bool:
 
     def save_results(self, output_path: str = "sanskriti_results.json"):
         """Save detailed results to file"""
-        with open(output_path, 'w') as f:
-            json.dump({
-                "metrics": self.get_metrics(),
-                "predictions": self.results
-            }, f, indent=2, ensure_ascii=False)
+        with open(output_path, "w") as f:
+            json.dump(
+                {"metrics": self.get_metrics(), "predictions": self.results},
+                f,
+                indent=2,
+                ensure_ascii=False,
+            )
         logger.info(f"💾 Results saved to {output_path}")
 
     def get_metrics(self):
         """Return current metrics"""
         if not self.results:
             return {}
-        correct = sum(1 for r in self.results if r['correct'])
-        return {
-            "accuracy": correct / len(self.results),
-            "samples": len(self.results)
-        }
+        correct = sum(1 for r in self.results if r["correct"])
+        return {"accuracy": correct / len(self.results), "samples": len(self.results)}
+
 
 if __name__ == "__main__":
     import argparse
@@ -179,7 +180,7 @@ def get_metrics(self):
     print(f"\n📊 SANSKRITI Results:")
     print(f"Overall Accuracy: {metrics['overall_accuracy']:.2%}")
 
-    for cat, stats in metrics['by_category'].items():
+    for cat, stats in metrics["by_category"].items():
         print(f"  {cat}: {stats['accuracy']:.2%} ({stats['correct']}/{stats['total']})")
 
     benchmark.save_results(args.output)