17-native-pdf-rag/evals.py at main · focused-dot-io/17-native-pdf-rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Evaluation suite for the native PDF RAG pipeline."""

from pathlib import Path

from langsmith import Client, evaluate
from openevals.llm import create_llm_as_judge

from pipeline import DOCS_DIR, DOCUMENT_STORE, graph, ingest_directory

ls_client = Client()

# Ingest docs if store is empty
if not DOCUMENT_STORE:
    if DOCS_DIR.exists() and list(DOCS_DIR.glob("*.pdf")):
        ingest_directory(DOCS_DIR)
    else:
        print("No PDFs in ./docs/. Add PDFs and re-run.")
        exit(1)

DATASET_NAME = "native-pdf-rag-evals"

if not ls_client.has_dataset(dataset_name=DATASET_NAME):
    dataset = ls_client.create_dataset(
        dataset_name=DATASET_NAME,
        description="Native PDF RAG pipeline evaluation dataset",
    )
    ls_client.create_examples(
        dataset_id=dataset.id,
        inputs=[
            {"question": "What is this document about?"},
            {"question": "Who are the parties or people mentioned?"},
            {"question": "What dates are referenced in the document?"},
            {"question": "What is the main argument or conclusion?"},
        ],
        outputs=[
            {"expected_type": "summary"},
            {"expected_type": "entity_extraction"},
            {"expected_type": "date_extraction"},
            {"expected_type": "argument_analysis"},
        ],
    )

FAITHFULNESS_PROMPT = """\
Question: {inputs[question]}
Retrieved documents and generated answer: {outputs[answer]}

Rate 0.0-1.0 on faithfulness: Is every claim in the answer supported by the
retrieved documents? Unsupported inferences score 0.
Return ONLY: {{"score": <float>, "reasoning": "<explanation>"}}"""

RELEVANCE_PROMPT = """\
Question: {inputs[question]}
Generated answer: {outputs[answer]}

Rate 0.0-1.0 on relevance: Does the answer actually address the user's question?
Return ONLY: {{"score": <float>, "reasoning": "<explanation>"}}"""

faithfulness_judge = create_llm_as_judge(
    prompt=FAITHFULNESS_PROMPT,
    model="anthropic:claude-sonnet-4-5-20250929",
    feedback_key="faithfulness",
    continuous=True,
)

relevance_judge = create_llm_as_judge(
    prompt=RELEVANCE_PROMPT,
    model="anthropic:claude-sonnet-4-5-20250929",
    feedback_key="relevance",
    continuous=True,
)


def retrieval_has_content(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Did retrieval return documents with actual OCR'd text?"""
    docs = outputs.get("documents", [])
    has_content = any(len(d.get("content", "")) > 50 for d in docs)
    return {"key": "retrieval_has_content", "score": 1.0 if has_content else 0.0}


def confidence_check(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Does the pipeline report appropriate confidence?"""
    confidence = outputs.get("confidence", "")
    score = 1.0 if confidence == "high" else 0.5
    return {"key": "confidence", "score": score}


def rewrite_efficiency(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Fewer rewrites = better initial retrieval. PDF embeddings should need fewer."""
    count = outputs.get("rewrite_count", 0)
    score = 1.0 if count == 0 else (0.7 if count == 1 else 0.4)
    return {"key": "rewrite_efficiency", "score": score}


def target(inputs: dict) -> dict:
    return graph.invoke(
        {
            "question": inputs["question"],
            "rewritten_query": "",
            "documents": [],
            "relevance_score": 0.0,
            "rewrite_count": 0,
            "max_rewrites": 2,
            "answer": "",
            "citations": [],
            "confidence": "",
        }
    )


if __name__ == "__main__":
    results = evaluate(
        target,
        data=DATASET_NAME,
        evaluators=[
            faithfulness_judge,
            relevance_judge,
            retrieval_has_content,
            confidence_check,
            rewrite_efficiency,
        ],
        experiment_prefix="native-pdf-rag-v1",
        max_concurrency=2,
    )
    print("\nEvaluation complete. Check LangSmith for results.")