diff --git a/demo/app.py b/demo/app.py
new file mode 100644
index 0000000..e7914e5
--- /dev/null
+++ b/demo/app.py
@@ -0,0 +1,105 @@
+from flask import Flask, render_template, request, jsonify
+import time
+from utils import (SYSTEM_PROMPT_QAG, SYSTEM_PROMPT_QA,
+                   QAG_TEMPLATE, QA_TEMPLATE, parse_output_default)
+from run_genai import prompt_genai
+import sacrebleu
+
+app = Flask(__name__)
+
+def run_treqa(source, reference, candidates):
+    """
+    Runs the TREQA pipeline (question generation, question answering,
+    and answer scoring) with server-side progress logging.
+    """
+ print("\n[TREQA] Starting evaluation...")
+
+ # --- Step 1: Question Generation (QAG) ---
+ print("--- Step 1: Generating Questions (QAG) ---")
+    # The candidate translations are joined into a single block for the QAG prompt
+    prompt_qag = QAG_TEMPLATE.format(src_passage=source,
+                                     ref_passage=reference,
+                                     alternatives="\n".join(candidates))
+ response = prompt_genai(SYSTEM_PROMPT_QAG, prompt_qag)
+
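+    # parse_output_default returns de-duplicated (question, answer) pairs;
+    # only the questions are kept here, since answers are re-generated per passage in Step 2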
+ questions = [x[0] for x in parse_output_default(response)]
+ print(f"--- Step 1 COMPLETED. Generated {len(questions)} questions. ---")
+ for i, q in enumerate(questions):
+ print(f" - Q{i+1}: {q}")
+
+ # --- Step 2: Question Answering (QA) ---
+ print("\n--- Step 2: Answering Questions (QA) ---")
+ reference_answers = []
+ candidate_answers = [[] for _ in candidates]
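+    # candidate_answers[i][j] will hold candidate i's answer to question j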
+
+ for q_idx, question in enumerate(questions):
+ print(f" - Answering question {q_idx + 1}/{len(questions)} for reference and {len(candidates)} candidates...")
+ # reference
+ prompt_qa_ref = QA_TEMPLATE.format(passage=reference, question=question)
+ answer_ref = prompt_genai(SYSTEM_PROMPT_QA, prompt_qa_ref)
+ reference_answers.append(answer_ref)
+
+ # candidate
+ for i, candidate in enumerate(candidates):
+ prompt_qa_cand = QA_TEMPLATE.format(passage=candidate, question=question)
+ answer_cand = prompt_genai(SYSTEM_PROMPT_QA, prompt_qa_cand)
+ candidate_answers[i].append(answer_cand)
+ print("--- Step 2 COMPLETED. All questions answered. ---")
+
+
+ # --- Step 3: Answer Correctness & Scoring ---
+ print("\n--- Step 3: Scoring Answers ---")
+ scores = []
+
+ # Loop over each candidate's list of answers
+ for i, cand_answers_list in enumerate(candidate_answers):
+ scores_for_candidate = []
+ # Loop over each answer, which corresponds to a question (j is the question index)
+ for j, cand_ans in enumerate(cand_answers_list):
+ # Get the corresponding reference answer for the same question
+ ref_ans = reference_answers[j]
+
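+            # chrF (sacrebleu) measures surface overlap between the candidate's
+            # answer and the reference answer; scores range from 0 to 100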
+ score_obj = sacrebleu.sentence_chrf(cand_ans, [ref_ans])
+ score_val = score_obj.score
+
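+            # Heuristic cut-off used only for the demo's qualitative match labels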
+ threshold = 50.0
+ text = "Excellent Match" if score_val > threshold else "Poor Match"
+ rating = "high-score" if score_val > threshold else "low-score"
+
+ scores_for_candidate.append({"score": score_val, "text": text, "rating": rating})
+
+ scores.append(scores_for_candidate)
+ print("--- Step 3 COMPLETED. All answers scored. ---")
+
+
+    # Brief pause before returning the results to the client
+    time.sleep(1)
+
+ print("\n[TREQA] Evaluation finished. Sending results to client.")
+ return {
+ "questions": questions,
+ "reference_answers": reference_answers,
+ "candidate_answers": candidate_answers,
+ "scores": scores
+ }
+
+@app.route('/')
+def index():
+ """Renders the main home page."""
+ return render_template('index.html')
+
+@app.route('/evaluate', methods=['POST'])
+def evaluate():
+ """API endpoint to run the TREQA evaluation."""
+    data = request.get_json(silent=True) or {}
+ source = data.get('source')
+ reference = data.get('reference')
+ candidates = data.get('candidates')
+
+ if not source or not reference or not candidates:
+ return jsonify({"error": "Missing source, reference or candidates"}), 400
+
+ results = run_treqa(source, reference, candidates)
+
+ return jsonify(results)
+
+if __name__ == '__main__':
+ app.run(debug=True)
\ No newline at end of file
diff --git a/demo/requirements.txt b/demo/requirements.txt
new file mode 100644
index 0000000..a788c93
--- /dev/null
+++ b/demo/requirements.txt
@@ -0,0 +1,3 @@
+Flask==2.2.2
+google-genai
+sacrebleu
\ No newline at end of file
diff --git a/demo/run_genai.py b/demo/run_genai.py
new file mode 100644
index 0000000..1be39af
--- /dev/null
+++ b/demo/run_genai.py
@@ -0,0 +1,44 @@
+from google import genai
+import os
+
+
+assert (
+ "GEMINI_API_KEY" in os.environ
+), "Please set the GEMINI_API_KEY environment variable"
+
+client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
+
+def prompt_genai(system_prompt, user_prompt):
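+    """Send a single prompt to Gemini and return the plain-text response."""
+    # Deterministic decoding (temperature 0) with a large output budget;
+    # safety categories are set to BLOCK_NONE so answers are not silently filtered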
+ config = genai.types.GenerateContentConfig(
+ temperature=0.0,
+ max_output_tokens=32768,
+ top_p=0.7,
+ top_k=100,
+ system_instruction=system_prompt,
+ response_mime_type="text/plain",
+ safety_settings=[
+ genai.types.SafetySetting(
+ category=category, threshold=genai.types.HarmBlockThreshold.BLOCK_NONE
+ )
+ for category in [
+ genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+ genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+ genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+ genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+ ]
+ ],
+ )
+
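+    # Retry until a usable completion arrives: a response with no candidates
+    # (e.g. a blocked request) or one truncated at the token limit triggers another attempt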
+    while True:
+ response = client.models.generate_content(
+ model="gemini-2.5-flash", contents=user_prompt, config=config
+ )
+ if response.candidates is None:
+ continue
+
+ if response.candidates[0].finish_reason == "MAX_TOKENS":
+ continue
+
+ return response.text
+
+
diff --git a/demo/static/css/style.css b/demo/static/css/style.css
new file mode 100644
index 0000000..fdb5b53
--- /dev/null
+++ b/demo/static/css/style.css
@@ -0,0 +1,316 @@
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&family=Lora:wght@400;600&display=swap');
+
+:root {
+ --primary-color: #005f73;
+ --secondary-color: #0a9396;
+ --accent-color: #94d2bd;
+ --background-color: #f8f9fa;
+ --text-color: #343a40;
+ --light-gray: #e9ecef;
+ --dark-gray: #6c757d;
+ --card-shadow: 0 4px 15px rgba(0, 0, 0, 0.07);
+ --border-radius: 12px;
+}
+
+body {
+ font-family: 'Inter', sans-serif;
+ line-height: 1.7;
+ background-color: var(--background-color);
+ color: var(--text-color);
+ margin: 0;
+ padding: 0;
+ scroll-behavior: smooth;
+}
+
+.container {
+ max-width: 900px;
+ margin: 0 auto;
+ padding: 2rem;
+}
+
+header {
+ text-align: center;
+ padding: 3rem 0;
+ border-bottom: 1px solid var(--light-gray);
+}
+
+header h1 {
+ font-family: 'Lora', serif;
+ font-size: 2.8rem;
+ color: var(--primary-color);
+ margin-bottom: 1rem;
+}
+
+header .sub-title {
+ font-size: 1.2rem;
+ color: var(--dark-gray);
+ margin-top: -1rem;
+ margin-bottom: 1.5rem;
+}
+
+header .authors {
+ font-size: 1.1rem;
+ color: var(--dark-gray);
+ margin-bottom: 1.5rem;
+ line-height: 1.5;
+}
+
+header .links a {
+ color: var(--secondary-color);
+ text-decoration: none;
+ font-weight: 500;
+ margin: 0 1rem;
+ padding: 0.5rem 1rem;
+ border-radius: 8px;
+ transition: all 0.3s ease;
+ border: 1px solid transparent;
+}
+
+header .links a:hover {
+ background-color: #eef7f7;
+ border-color: var(--accent-color);
+}
+
+.abstract {
+ background-color: #ffffff;
+ padding: 2rem;
+ border-radius: var(--border-radius);
+ margin-top: 2rem;
+ box-shadow: var(--card-shadow);
+}
+
+.abstract h2 {
+ font-family: 'Lora', serif;
+ color: var(--primary-color);
+ margin-top: 0;
+ text-align: center;
+}
+
+.section {
+ padding: 3rem 0;
+ border-bottom: 1px solid var(--light-gray);
+}
+.section:last-child {
+ border-bottom: none;
+}
+
+h2.section-title {
+ text-align: center;
+ font-family: 'Lora', serif;
+ font-size: 2.2rem;
+ color: var(--primary-color);
+ margin-bottom: 2rem;
+}
+
+.main-diagram img {
+ max-width: 100%;
+ height: auto;
+ border-radius: var(--border-radius);
+ box-shadow: var(--card-shadow);
+ border: 1px solid var(--light-gray);
+}
+.main-diagram p {
+ text-align: center;
+ color: var(--dark-gray);
+ font-style: italic;
+ margin-top: 1rem;
+}
+
+#interactive-demo {
+ background-color: #ffffff;
+ padding: 2.5rem;
+ border-radius: var(--border-radius);
+ box-shadow: var(--card-shadow);
+}
+
+.input-group {
+ margin-bottom: 1.5rem;
+}
+
+.input-group label {
+ display: block;
+ font-weight: 500;
+ margin-bottom: 0.5rem;
+ color: var(--primary-color);
+}
+
+textarea, .candidate-input-group input {
+ width: 100%;
+ padding: 0.8rem;
+ border-radius: 8px;
+ border: 1px solid #ced4da;
+ font-family: 'Inter', sans-serif;
+ font-size: 1rem;
+ box-sizing: border-box;
+ transition: border-color 0.3s ease;
+}
+
+textarea {
+ min-height: 80px;
+ resize: vertical;
+}
+
+textarea:focus, .candidate-input-group input:focus {
+ outline: none;
+ border-color: var(--secondary-color);
+}
+
+#candidates-container .candidate-input-group {
+ display: flex;
+ align-items: center;
+ margin-bottom: 0.75rem;
+}
+
+#candidates-container input {
+ flex-grow: 1;
+}
+
+.remove-candidate-btn {
+ margin-left: 0.5rem;
+ padding: 0.6rem;
+ border: none;
+ background-color: #fde8e8;
+ color: #c0392b;
+ border-radius: 50%;
+ cursor: pointer;
+ font-weight: bold;
+ width: 35px;
+ height: 35px;
+ line-height: 1;
+ transition: all 0.2s ease;
+}
+.remove-candidate-btn:hover {
+ background-color: #e74c3c;
+ color: white;
+}
+
+#add-candidate-btn, #run-treqa-btn {
+ background-color: var(--secondary-color);
+ color: white;
+ border: none;
+ padding: 0.8rem 1.5rem;
+ border-radius: 8px;
+ cursor: pointer;
+ font-weight: 500;
+ font-size: 1rem;
+ transition: background-color 0.3s ease;
+ display: inline-block;
+}
+#add-candidate-btn {
+ background-color: var(--accent-color);
+ color: var(--primary-color);
+ margin-top: 0.5rem;
+}
+
+#add-candidate-btn:hover, #run-treqa-btn:hover {
+ background-color: var(--primary-color);
+}
+
+.button-container {
+ text-align: center;
+ margin-top: 2rem;
+}
+
+#results-container {
+ margin-top: 3rem;
+ opacity: 0;
+ transform: translateY(20px);
+ transition: opacity 0.5s ease, transform 0.5s ease;
+ display: none;
+ text-align: center;
+}
+#results-container.visible {
+ opacity: 1;
+ transform: translateY(0);
+ display: block;
+ text-align: left;
+}
+
+.result-step {
+ background-color: #ffffff;
+ padding: 1.5rem 2rem;
+ border-radius: var(--border-radius);
+ margin-bottom: 1.5rem;
+ border-left: 5px solid var(--accent-color);
+}
+
+.result-step h3 {
+ color: var(--primary-color);
+ margin-top: 0;
+ font-family: 'Lora', serif;
+}
+
+.question-list {
+ list-style-type: none;
+ padding-left: 0;
+}
+.question-list li {
+ background-color: #f1f8f7;
+ padding: 0.8rem 1rem;
+ border-radius: 8px;
+ margin-bottom: 0.5rem;
+}
+
+.answers-table {
+ width: 100%;
+ border-collapse: collapse;
+ margin-top: 1rem;
+}
+.answers-table th, .answers-table td {
+ padding: 1rem;
+ text-align: left;
+ border-bottom: 1px solid var(--light-gray);
+}
+.answers-table th {
+ background-color: #eef7f7;
+ color: var(--primary-color);
+ font-weight: 500;
+}
+
+.score-card {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ background-color: #f1f8f7;
+ padding: 1rem 1.5rem;
+ border-radius: 8px;
+ margin-bottom: 1rem;
+}
+.score-card .score-value {
+ font-size: 1.5rem;
+ font-weight: bold;
+ padding: 0.5rem 1rem;
+ border-radius: 8px;
+}
+
+.score-card.high-score .score-value {
+ background-color: #d4edda;
+ color: #155724;
+}
+.score-card.low-score .score-value {
+ background-color: #f8d7da;
+ color: #721c24;
+}
+
+.loading-spinner {
+ display: inline-block;
+ width: 40px;
+ height: 40px;
+ border: 4px solid var(--accent-color);
+ border-top-color: var(--primary-color);
+ border-radius: 50%;
+ animation: spin 1s ease-in-out infinite;
+ margin-bottom: 1rem;
+}
+
+@keyframes spin {
+ to { transform: rotate(360deg); }
+}
+
+footer {
+ text-align: center;
+ padding: 2rem 0;
+ color: var(--dark-gray);
+ font-size: 0.9rem;
+}
+
diff --git a/demo/static/img/treqa_diagram.jpg b/demo/static/img/treqa_diagram.jpg
new file mode 100644
index 0000000..5abc2e2
Binary files /dev/null and b/demo/static/img/treqa_diagram.jpg differ
diff --git a/demo/templates/index.html b/demo/templates/index.html
new file mode 100644
index 0000000..7e1640d
--- /dev/null
+++ b/demo/templates/index.html
@@ -0,0 +1,284 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>TREQA: Evaluating Paragraph-level MT with Question Answering</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>TREQA: Evaluating Paragraph-level MT with Question Answering</h1>
+            <!-- sub-title, author list, and links to the paper and code -->
+        </header>
+
+        <div class="abstract">
+            <h2>Abstract</h2>
+            <p>Despite the steady progress in machine translation evaluation, existing automatic metrics struggle to capture how well meaning is preserved beyond sentence boundaries. We posit that reliance on a single intrinsic quality score, trained to mimic human judgments, might be insufficient for evaluating translations of long, complex passages, and a more “pragmatic” approach that assesses how accurately key information is conveyed by a translation in context is needed. We introduce TREQA (Translation Evaluation via Question-Answering), a framework that extrinsically evaluates translation quality by assessing how accurately candidate translations answer reading comprehension questions that target key information in the original source or reference texts...</p>
+        </div>
+
+        <div class="section">
+            <h2 class="section-title">The TREQA Framework</h2>
+            <div class="main-diagram">
+                <img src="{{ url_for('static', filename='img/treqa_diagram.jpg') }}" alt="Diagram of the TREQA pipeline">
+                <p>Figure 1: TREQA assesses translation quality through a question-answering framework.</p>
+            </div>
+        </div>
+
+        <div class="section" id="interactive-demo">
+            <h2 class="section-title">Try TREQA Live</h2>
+            <p>This demo uses an example from the paper (Figure 6) to show how TREQA identifies subtle translation errors. You can edit the text or add your own candidates.</p>
+
+            <div class="input-group">
+                <label for="source-text">Source Text</label>
+                <textarea id="source-text"></textarea>
+            </div>
+            <div class="input-group">
+                <label for="reference-text">Reference Translation</label>
+                <textarea id="reference-text"></textarea>
+            </div>
+            <div class="input-group">
+                <label>Candidate Translations</label>
+                <div id="candidates-container">
+                    <!-- candidate rows (text input plus remove button) are added here -->
+                </div>
+                <button id="add-candidate-btn">Add Candidate</button>
+            </div>
+
+            <div class="button-container">
+                <button id="run-treqa-btn">Run TREQA Evaluation</button>
+            </div>
+
+            <div id="results-container">
+                <!-- generated questions, answers, and scores from /evaluate are rendered here -->
+            </div>
+        </div>
+
+        <footer>
+            <!-- footer credits -->
+        </footer>
+    </div>
+
+    <!-- Client-side script: collects the inputs, POSTs them as JSON to /evaluate,
+         and renders the returned questions, answers, and chrF scores. -->
+</body>
+</html>
\ No newline at end of file
diff --git a/demo/utils.py b/demo/utils.py
new file mode 100644
index 0000000..2bfaaf2
--- /dev/null
+++ b/demo/utils.py
@@ -0,0 +1,68 @@
+SYSTEM_PROMPT_QAG = "You are a helpful AI assistant skilled in generating questions and answers from given passages."
+
+QAG_TEMPLATE = """Generate question-answer pairs to verify translation accuracy. Each answer should be a key phrase, concept, or entity from the original passage (source or reference) that could help detect errors or mistranslations in the candidate(s).
+The questions and answers must be strictly in English, while ensuring that the meaning of the answer is preserved. The questions should be diverse and cover different aspects of the passage. Answer in the format:
+
+Q:
+A:
+
+Q:
+A:
+
+...
+
+Source Passage:
+{src_passage}
+
+Reference Passage:
+{ref_passage}
+
+Candidate Passage(s):
+{alternatives}
+
+Question-Answer Pairs:
+"""
+
+SYSTEM_PROMPT_QA = (
+ """You are a helpful AI assistant skilled in question answering."""
+)
+
+QA_TEMPLATE = """Given the following passage and question, return the answer in English using only the information from the passage. The answer should be a concise response based on the provided content.
+###
+Passage:
+{passage}
+###
+Question:
+{question}
+###
+Answer:"""
+
+def parse_output_default(
+    output: str,
+) -> list[tuple[str, str]]:
+    """Parse Q:/A: pairs from the model output, skipping malformed blocks."""
+ pairs = []
+ skipped_outs = []
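+    # skipped_outs collects malformed blocks for debugging; it is not returned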
+ for qa in output.split("\n\n"):
+ # skip if there is no content
+ if not qa.strip():
+ reason = "no content"
+ skipped_outs.append((reason, qa))
+ continue
+ # skip if there are not two lines
+ if len(qa.split("\n")) != 2:
+ reason = "not two lines"
+ skipped_outs.append((reason, qa))
+ continue
+
+ q, a = qa.split("\n")
+ # skip if the qa pair don't start with Q: and A:
+ if not q.startswith("Q:") or not a.startswith("A:"):
+ reason = "no Q: or A:"
+ skipped_outs.append((reason, qa))
+ continue
+
+        q = q.removeprefix("Q:").strip()
+        a = a.removeprefix("A:").strip()
+        pairs.append((q, a))
+
+    # De-duplicate while preserving the order in which questions were generated
+    return list(dict.fromkeys(pairs))
\ No newline at end of file