amdb/benchmark.py at main · BETAER-08/amdb · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import subprocess
import os
import sys
import re
import glob
import heapq

try:
    import tiktoken
except ImportError:
    print("❌ Error: 'tiktoken' library is required.")
    print("👉 Run: pip install tiktoken")
    sys.exit(1)

ENCODER = tiktoken.get_encoding("cl100k_base")

def run_command(cmd, capture=True):
    try:
        result = subprocess.run(cmd, capture_output=capture, text=True, check=False)
        return result
    except Exception as e:
        return None

def count_tokens(text):
    return len(ENCODER.encode(text))

def scan_project_files(root_dir="src"):
    file_list = []
    total_tokens = 0

    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".rs"):
                full_path = os.path.join(root, file)
                query_name = file.replace(".rs", "").replace("_", " ")

                with open(full_path, "r", encoding="utf-8") as f:
                    content = f.read()
                    raw_len = count_tokens(content)
                    total_tokens += raw_len

                file_list.append({
                    "name": query_name,
                    "path": full_path,
                    "tokens": raw_len
                })

    return file_list, total_tokens

def extract_interface_section(md_path, target_file_path):
    if not os.path.exists(md_path): return ""
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    filename = os.path.basename(target_file_path)
    escaped_name = re.escape(filename)

    pattern = re.compile(f"### .*{escaped_name}(.*?)(?=\n#|\\Z)", re.DOTALL)

    match = pattern.search(content)
    if match:
        return match.group(1).strip()
    return ""

def run_benchmark():
    print(f"\n🚀 Starting AMDB Official Benchmark Suite")
    print(f"   Target: {os.getcwd()}")
    print("-" * 60)

    if not os.path.exists(".database"):
        print("⚙️ Initializing AMDB database...", end=" ")
        run_command(["amdb", "init"])
        print("Done.")

    print("📊 Measuring Codebase Baseline...", end=" ")
    file_list, total_project_tokens = scan_project_files("src")
    print(f"Done.")
    print(f"   - Files Scanned: {len(file_list)}")
    print(f"   - Total Raw Tokens: {total_project_tokens:,} (Full codebase size)")

    results = {
        "retrieval_hits": 0,
        "graph_hits": 0,
        "global_reduction_sum": 0,
        "file_stats": []
    }

    print("\n⚔️ Running Comprehensive Tests...")

    for file_info in file_list:
        query = file_info["name"]
        raw_tokens = file_info["tokens"]
        target_path = file_info["path"]

        if os.path.exists(".amdb"):
            for f in glob.glob(".amdb/*.md"):
                os.remove(f)

        run_command(["amdb", "generate", "--focus", query])

        generated_files = glob.glob(".amdb/*.md")
        if not generated_files: continue
        md_path = generated_files[0]

        with open(md_path, "r", encoding="utf-8") as f:
            full_md_content = f.read()

        if os.path.basename(target_path) in full_md_content:
            results["retrieval_hits"] += 1

        if "```mermaid" in full_md_content:
            results["graph_hits"] += 1

        amdb_full_tokens = count_tokens(full_md_content)
        global_reduction = (1 - amdb_full_tokens / total_project_tokens) * 100
        results["global_reduction_sum"] += global_reduction

        pure_summary = extract_interface_section(md_path, target_path)
        amdb_pure_tokens = count_tokens(pure_summary) if pure_summary else 0

        if raw_tokens > 0 and amdb_pure_tokens > 0:
            compression = (1 - amdb_pure_tokens / raw_tokens) * 100
            results["file_stats"].append({
                "name": query,
                "raw": raw_tokens,
                "amdb": amdb_pure_tokens,
                "compression": compression
            })

    top_5_heavy = heapq.nlargest(5, results["file_stats"], key=lambda x: x["raw"])

    print("\n" + "=" * 95)
    print("🥊 HEAVYWEIGHT MATCH: Implementation (Raw) vs Interface (AMDB)")
    print("   Target: Top 5 Largest Files (Evaluating Noise Reduction)")
    print("=" * 95)
    print(f"{'File Name':<25} | {'Raw Tokens':<10} | {'AMDB Tokens':<12} | {'Compression':<12} | {'Verdict'}")
    print("-" * 95)

    total_heavy_comp = 0
    for match in top_5_heavy:
        winner = "🏆 AMDB" if match["compression"] > 50 else "Raw"
        print(
            f"{match['name']:<25} | {match['raw']:<10} | {match['amdb']:<12} | {match['compression']:5.1f}%       | {winner}")
        total_heavy_comp += match["compression"]

    total_files = len(file_list)

    if total_files == 0:
        print("\n❌ Error: No source files found to benchmark.")
        return

    success_rate = (results["retrieval_hits"] / total_files) * 100
    graph_rate = (results["graph_hits"] / total_files) * 100
    avg_global_reduction = results["global_reduction_sum"] / total_files
    avg_heavy_comp = total_heavy_comp / len(top_5_heavy) if top_5_heavy else 0

    print("\n" + "=" * 60)
    print("🏆 AMDB OFFICIAL BENCHMARK SCORECARD")
    print("=" * 60)

    print(f"\n1. 🎯 Precision Targeting (Retrieval Accuracy)")
    print(f"   - Score: {success_rate:.1f}%")
    print(f"   - Verdict: Finds the exact file requested.")

    print(f"\n2. 📉 Global Efficiency (Token Savings)")
    print(f"   - Score: {avg_global_reduction:.1f}% Reduction")
    print(f"   - Verdict: Saves ~{avg_global_reduction:.0f}% of tokens compared to full-repo context dumping.")

    print(f"\n3. 🗜️ Noise Reduction (Interface Extraction)")
    print(f"   - Score: {avg_heavy_comp:.1f}% Compression (on complex files)")
    print(f"   - Verdict: Strips implementation details, keeping only structural context.")

    print(f"\n4. 🕸️ Context Awareness")
    print(f"   - Score: {graph_rate:.1f}% Graph Inclusion")
    print(f"   - Verdict: Dependency graphs were generated for spatial reasoning.")

    print("\n✅ Benchmark Complete.")

if __name__ == "__main__":
    run_benchmark()