diff --git a/demo/app.py b/demo/app.py
new file mode 100644
index 0000000..e7914e5
--- /dev/null
+++ b/demo/app.py
@@ -0,0 +1,105 @@
+from flask import Flask, render_template, request, jsonify
+import time
+from utils import (SYSTEM_PROMPT_QAG, SYSTEM_PROMPT_QA,
+                   QAG_TEMPLATE, QA_TEMPLATE, parse_output_default)
+from run_genai import prompt_genai
+import sacrebleu
+
+app = Flask(__name__)
+
+def run_treqa(source, reference, candidates):
+    """
+    Runs the TREQA pipeline (question generation, question answering,
+    and answer scoring) with server-side progress logging.
+    """
+ print("\n[TREQA] Starting evaluation...")
+
+ # --- Step 1: Question Generation (QAG) ---
+ print("--- Step 1: Generating Questions (QAG) ---")
+    # The candidate translations are joined into a single block for the QAG prompt
+    prompt_qag = QAG_TEMPLATE.format(src_passage=source,
+                                     ref_passage=reference,
+                                     alternatives="\n".join(candidates))
+ response = prompt_genai(SYSTEM_PROMPT_QAG, prompt_qag)
+
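+    # parse_output_default returns de-duplicated (question, answer) pairs;
+    # only the questions are kept here, since answers are re-generated per passage in Step 2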
+ questions = [x[0] for x in parse_output_default(response)]
+ print(f"--- Step 1 COMPLETED. Generated {len(questions)} questions. ---")
+ for i, q in enumerate(questions):
+ print(f" - Q{i+1}: {q}")
+
+ # --- Step 2: Question Answering (QA) ---
+ print("\n--- Step 2: Answering Questions (QA) ---")
+ reference_answers = []
+ candidate_answers = [[] for _ in candidates]
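+    # candidate_answers[i][j] will hold candidate i's answer to question j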
+
+ for q_idx, question in enumerate(questions):
+ print(f" - Answering question {q_idx + 1}/{len(questions)} for reference and {len(candidates)} candidates...")
+ # reference
+ prompt_qa_ref = QA_TEMPLATE.format(passage=reference, question=question)
+ answer_ref = prompt_genai(SYSTEM_PROMPT_QA, prompt_qa_ref)
+ reference_answers.append(answer_ref)
+
+ # candidate
+ for i, candidate in enumerate(candidates):
+ prompt_qa_cand = QA_TEMPLATE.format(passage=candidate, question=question)
+ answer_cand = prompt_genai(SYSTEM_PROMPT_QA, prompt_qa_cand)
+ candidate_answers[i].append(answer_cand)
+ print("--- Step 2 COMPLETED. All questions answered. ---")
+
+
+ # --- Step 3: Answer Correctness & Scoring ---
+ print("\n--- Step 3: Scoring Answers ---")
+ scores = []
+
+ # Loop over each candidate's list of answers
+ for i, cand_answers_list in enumerate(candidate_answers):
+ scores_for_candidate = []
+ # Loop over each answer, which corresponds to a question (j is the question index)
+ for j, cand_ans in enumerate(cand_answers_list):
+ # Get the corresponding reference answer for the same question
+ ref_ans = reference_answers[j]
+
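+            # chrF (sacrebleu) measures surface overlap between the candidate's
+            # answer and the reference answer; scores range from 0 to 100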
+ score_obj = sacrebleu.sentence_chrf(cand_ans, [ref_ans])
+ score_val = score_obj.score
+
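+            # Heuristic cut-off used only for the demo's qualitative match labels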
+ threshold = 50.0
+ text = "Excellent Match" if score_val > threshold else "Poor Match"
+ rating = "high-score" if score_val > threshold else "low-score"
+
+ scores_for_candidate.append({"score": score_val, "text": text, "rating": rating})
+
+ scores.append(scores_for_candidate)
+ print("--- Step 3 COMPLETED. All answers scored. ---")
+
+
+    # Brief pause before returning the results to the client
+    time.sleep(1)
+
+ print("\n[TREQA] Evaluation finished. Sending results to client.")
+ return {
+ "questions": questions,
+ "reference_answers": reference_answers,
+ "candidate_answers": candidate_answers,
+ "scores": scores
+ }
+
+@app.route('/')
+def index():
+ """Renders the main home page."""
+ return render_template('index.html')
+
+@app.route('/evaluate', methods=['POST'])
+def evaluate():
+ """API endpoint to run the TREQA evaluation."""
+    data = request.get_json(silent=True) or {}
+ source = data.get('source')
+ reference = data.get('reference')
+ candidates = data.get('candidates')
+
+ if not source or not reference or not candidates:
+ return jsonify({"error": "Missing source, reference or candidates"}), 400
+
+ results = run_treqa(source, reference, candidates)
+
+ return jsonify(results)
+
+if __name__ == '__main__':
+ app.run(debug=True)
\ No newline at end of file
diff --git a/demo/requirements.txt b/demo/requirements.txt
new file mode 100644
index 0000000..a788c93
--- /dev/null
+++ b/demo/requirements.txt
@@ -0,0 +1,3 @@
+Flask==2.2.2
+google-genai
+sacrebleu
\ No newline at end of file
diff --git a/demo/run_genai.py b/demo/run_genai.py
new file mode 100644
index 0000000..1be39af
--- /dev/null
+++ b/demo/run_genai.py
@@ -0,0 +1,44 @@
+from google import genai
+import os
+
+
+assert (
+ "GEMINI_API_KEY" in os.environ
+), "Please set the GEMINI_API_KEY environment variable"
+
+client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
+
+def prompt_genai(system_prompt, user_prompt):
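+    """Send a single prompt to Gemini and return the plain-text response."""
+    # Deterministic decoding (temperature 0) with a large output budget;
+    # safety categories are set to BLOCK_NONE so answers are not silently filtered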
+ config = genai.types.GenerateContentConfig(
+ temperature=0.0,
+ max_output_tokens=32768,
+ top_p=0.7,
+ top_k=100,
+ system_instruction=system_prompt,
+ response_mime_type="text/plain",
+ safety_settings=[
+ genai.types.SafetySetting(
+ category=category, threshold=genai.types.HarmBlockThreshold.BLOCK_NONE
+ )
+ for category in [
+ genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+ genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+ genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+ genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+ ]
+ ],
+ )
+
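+    # Retry until a usable completion arrives: a response with no candidates
+    # (e.g. a blocked request) or one truncated at the token limit triggers another attempt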
+    while True:
+ response = client.models.generate_content(
+ model="gemini-2.5-flash", contents=user_prompt, config=config
+ )
+ if response.candidates is None:
+ continue
+
+ if response.candidates[0].finish_reason == "MAX_TOKENS":
+ continue
+
+ return response.text
+
+
diff --git a/demo/static/css/style.css b/demo/static/css/style.css
new file mode 100644
index 0000000..fdb5b53
--- /dev/null
+++ b/demo/static/css/style.css
@@ -0,0 +1,316 @@
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&family=Lora:wght@400;600&display=swap');
+
+:root {
+ --primary-color: #005f73;
+ --secondary-color: #0a9396;
+ --accent-color: #94d2bd;
+ --background-color: #f8f9fa;
+ --text-color: #343a40;
+ --light-gray: #e9ecef;
+ --dark-gray: #6c757d;
+ --card-shadow: 0 4px 15px rgba(0, 0, 0, 0.07);
+ --border-radius: 12px;
+}
+
+body {
+ font-family: 'Inter', sans-serif;
+ line-height: 1.7;
+ background-color: var(--background-color);
+ color: var(--text-color);
+ margin: 0;
+ padding: 0;
+ scroll-behavior: smooth;
+}
+
+.container {
+ max-width: 900px;
+ margin: 0 auto;
+ padding: 2rem;
+}
+
+header {
+ text-align: center;
+ padding: 3rem 0;
+ border-bottom: 1px solid var(--light-gray);
+}
+
+header h1 {
+ font-family: 'Lora', serif;
+ font-size: 2.8rem;
+ color: var(--primary-color);
+ margin-bottom: 1rem;
+}
+
+header .sub-title {
+ font-size: 1.2rem;
+ color: var(--dark-gray);
+ margin-top: -1rem;
+ margin-bottom: 1.5rem;
+}
+
+header .authors {
+ font-size: 1.1rem;
+ color: var(--dark-gray);
+ margin-bottom: 1.5rem;
+ line-height: 1.5;
+}
+
+header .links a {
+ color: var(--secondary-color);
+ text-decoration: none;
+ font-weight: 500;
+ margin: 0 1rem;
+ padding: 0.5rem 1rem;
+ border-radius: 8px;
+ transition: all 0.3s ease;
+ border: 1px solid transparent;
+}
+
+header .links a:hover {
+ background-color: #eef7f7;
+ border-color: var(--accent-color);
+}
+
+.abstract {
+ background-color: #ffffff;
+ padding: 2rem;
+ border-radius: var(--border-radius);
+ margin-top: 2rem;
+ box-shadow: var(--card-shadow);
+}
+
+.abstract h2 {
+ font-family: 'Lora', serif;
+ color: var(--primary-color);
+ margin-top: 0;
+ text-align: center;
+}
+
+.section {
+ padding: 3rem 0;
+ border-bottom: 1px solid var(--light-gray);
+}
+.section:last-child {
+ border-bottom: none;
+}
+
+h2.section-title {
+ text-align: center;
+ font-family: 'Lora', serif;
+ font-size: 2.2rem;
+ color: var(--primary-color);
+ margin-bottom: 2rem;
+}
+
+.main-diagram img {
+ max-width: 100%;
+ height: auto;
+ border-radius: var(--border-radius);
+ box-shadow: var(--card-shadow);
+ border: 1px solid var(--light-gray);
+}
+.main-diagram p {
+ text-align: center;
+ color: var(--dark-gray);
+ font-style: italic;
+ margin-top: 1rem;
+}
+
+#interactive-demo {
+ background-color: #ffffff;
+ padding: 2.5rem;
+ border-radius: var(--border-radius);
+ box-shadow: var(--card-shadow);
+}
+
+.input-group {
+ margin-bottom: 1.5rem;
+}
+
+.input-group label {
+ display: block;
+ font-weight: 500;
+ margin-bottom: 0.5rem;
+ color: var(--primary-color);
+}
+
+textarea, .candidate-input-group input {
+ width: 100%;
+ padding: 0.8rem;
+ border-radius: 8px;
+ border: 1px solid #ced4da;
+ font-family: 'Inter', sans-serif;
+ font-size: 1rem;
+ box-sizing: border-box;
+ transition: border-color 0.3s ease;
+}
+
+textarea {
+ min-height: 80px;
+ resize: vertical;
+}
+
+textarea:focus, .candidate-input-group input:focus {
+ outline: none;
+ border-color: var(--secondary-color);
+}
+
+#candidates-container .candidate-input-group {
+ display: flex;
+ align-items: center;
+ margin-bottom: 0.75rem;
+}
+
+#candidates-container input {
+ flex-grow: 1;
+}
+
+.remove-candidate-btn {
+ margin-left: 0.5rem;
+ padding: 0.6rem;
+ border: none;
+ background-color: #fde8e8;
+ color: #c0392b;
+ border-radius: 50%;
+ cursor: pointer;
+ font-weight: bold;
+ width: 35px;
+ height: 35px;
+ line-height: 1;
+ transition: all 0.2s ease;
+}
+.remove-candidate-btn:hover {
+ background-color: #e74c3c;
+ color: white;
+}
+
+#add-candidate-btn, #run-treqa-btn {
+ background-color: var(--secondary-color);
+ color: white;
+ border: none;
+ padding: 0.8rem 1.5rem;
+ border-radius: 8px;
+ cursor: pointer;
+ font-weight: 500;
+ font-size: 1rem;
+ transition: background-color 0.3s ease;
+ display: inline-block;
+}
+#add-candidate-btn {
+ background-color: var(--accent-color);
+ color: var(--primary-color);
+ margin-top: 0.5rem;
+}
+
+#add-candidate-btn:hover, #run-treqa-btn:hover {
+ background-color: var(--primary-color);
+}
+
+.button-container {
+ text-align: center;
+ margin-top: 2rem;
+}
+
+#results-container {
+ margin-top: 3rem;
+ opacity: 0;
+ transform: translateY(20px);
+ transition: opacity 0.5s ease, transform 0.5s ease;
+ display: none;
+ text-align: center;
+}
+#results-container.visible {
+ opacity: 1;
+ transform: translateY(0);
+ display: block;
+ text-align: left;
+}
+
+.result-step {
+ background-color: #ffffff;
+ padding: 1.5rem 2rem;
+ border-radius: var(--border-radius);
+ margin-bottom: 1.5rem;
+ border-left: 5px solid var(--accent-color);
+}
+
+.result-step h3 {
+ color: var(--primary-color);
+ margin-top: 0;
+ font-family: 'Lora', serif;
+}
+
+.question-list {
+ list-style-type: none;
+ padding-left: 0;
+}
+.question-list li {
+ background-color: #f1f8f7;
+ padding: 0.8rem 1rem;
+ border-radius: 8px;
+ margin-bottom: 0.5rem;
+}
+
+.answers-table {
+ width: 100%;
+ border-collapse: collapse;
+ margin-top: 1rem;
+}
+.answers-table th, .answers-table td {
+ padding: 1rem;
+ text-align: left;
+ border-bottom: 1px solid var(--light-gray);
+}
+.answers-table th {
+ background-color: #eef7f7;
+ color: var(--primary-color);
+ font-weight: 500;
+}
+
+.score-card {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ background-color: #f1f8f7;
+ padding: 1rem 1.5rem;
+ border-radius: 8px;
+ margin-bottom: 1rem;
+}
+.score-card .score-value {
+ font-size: 1.5rem;
+ font-weight: bold;
+ padding: 0.5rem 1rem;
+ border-radius: 8px;
+}
+
+.score-card.high-score .score-value {
+ background-color: #d4edda;
+ color: #155724;
+}
+.score-card.low-score .score-value {
+ background-color: #f8d7da;
+ color: #721c24;
+}
+
+.loading-spinner {
+ display: inline-block;
+ width: 40px;
+ height: 40px;
+ border: 4px solid var(--accent-color);
+ border-top-color: var(--primary-color);
+ border-radius: 50%;
+ animation: spin 1s ease-in-out infinite;
+ margin-bottom: 1rem;
+}
+
+@keyframes spin {
+ to { transform: rotate(360deg); }
+}
+
+footer {
+ text-align: center;
+ padding: 2rem 0;
+ color: var(--dark-gray);
+ font-size: 0.9rem;
+}
+
diff --git a/demo/static/img/treqa_diagram.jpg b/demo/static/img/treqa_diagram.jpg
new file mode 100644
index 0000000..5abc2e2
Binary files /dev/null and b/demo/static/img/treqa_diagram.jpg differ
diff --git a/demo/templates/index.html b/demo/templates/index.html
new file mode 100644
index 0000000..7e1640d
--- /dev/null
+++ b/demo/templates/index.html
@@ -0,0 +1,284 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>TREQA: Evaluating Paragraph-level MT with Question Answering</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>TREQA: Evaluating Paragraph-level MT with Question Answering</h1>
+            <!-- sub-title, author list, and links to the paper and code -->
+        </header>
+
+        <div class="abstract">
+            <h2>Abstract</h2>
+            <p>Despite the steady progress in machine translation evaluation, existing automatic metrics struggle to capture how well meaning is preserved beyond sentence boundaries. We posit that reliance on a single intrinsic quality score, trained to mimic human judgments, might be insufficient for evaluating translations of long, complex passages, and a more “pragmatic” approach that assesses how accurately key information is conveyed by a translation in context is needed. We introduce TREQA (Translation Evaluation via Question-Answering), a framework that extrinsically evaluates translation quality by assessing how accurately candidate translations answer reading comprehension questions that target key information in the original source or reference texts...</p>
+        </div>
+
+        <div class="section">
+            <h2 class="section-title">The TREQA Framework</h2>
+            <div class="main-diagram">
+                <img src="{{ url_for('static', filename='img/treqa_diagram.jpg') }}" alt="Diagram of the TREQA pipeline">
+                <p>Figure 1: TREQA assesses translation quality through a question-answering framework.</p>
+            </div>
+        </div>
+
+        <div class="section" id="interactive-demo">
+            <h2 class="section-title">Try TREQA Live</h2>
+            <p>This demo uses an example from the paper (Figure 6) to show how TREQA identifies subtle translation errors. You can edit the text or add your own candidates.</p>
+
+            <div class="input-group">
+                <label for="source-text">Source Text</label>
+                <textarea id="source-text"></textarea>
+            </div>
+            <div class="input-group">
+                <label for="reference-text">Reference Translation</label>
+                <textarea id="reference-text"></textarea>
+            </div>
+            <div class="input-group">
+                <label>Candidate Translations</label>
+                <div id="candidates-container">
+                    <!-- candidate rows (text input plus remove button) are added here -->
+                </div>
+                <button id="add-candidate-btn">Add Candidate</button>
+            </div>
+
+            <div class="button-container">
+                <button id="run-treqa-btn">Run TREQA Evaluation</button>
+            </div>
+
+            <div id="results-container">
+                <!-- generated questions, answers, and scores from /evaluate are rendered here -->
+            </div>
+        </div>
+
+        <footer>
+            <!-- footer credits -->
+        </footer>
+    </div>
+
+    <!-- Client-side script: collects the inputs, POSTs them as JSON to /evaluate,
+         and renders the returned questions, answers, and chrF scores. -->
+</body>
+</html>
\ No newline at end of file
diff --git a/demo/utils.py b/demo/utils.py
new file mode 100644
index 0000000..2bfaaf2
--- /dev/null
+++ b/demo/utils.py
@@ -0,0 +1,68 @@
+SYSTEM_PROMPT_QAG = "You are a helpful AI assistant skilled in generating questions and answers from given passages."
+
+QAG_TEMPLATE = """Generate question-answer pairs to verify translation accuracy. Each answer should be a key phrase, concept, or entity from the original passage (source or reference) that could help detect errors or mistranslations in the candidate(s).
+The questions and answers must be strictly in English, while ensuring that the meaning of the answer is preserved. The questions should be diverse and cover different aspects of the passage. Answer in the format:
+
+Q:
+A:
+
+Q:
+A:
+
+...
+
+Source Passage:
+{src_passage}
+
+Reference Passage:
+{ref_passage}
+
+Candidate Passage(s):
+{alternatives}
+
+Question-Answer Pairs:
+"""
+
+SYSTEM_PROMPT_QA = (
+ """You are a helpful AI assistant skilled in question answering."""
+)
+
+QA_TEMPLATE = """Given the following passage and question, return the answer in English using only the information from the passage. The answer should be a concise response based on the provided content.
+###
+Passage:
+{passage}
+###
+Question:
+{question}
+###
+Answer:"""
+
+def parse_output_default(
+    output: str,
+) -> list[tuple[str, str]]:
+    """Parse Q:/A: pairs from the model output, skipping malformed blocks."""
+ pairs = []
+ skipped_outs = []
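+    # skipped_outs collects malformed blocks for debugging; it is not returned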
+ for qa in output.split("\n\n"):
+ # skip if there is no content
+ if not qa.strip():
+ reason = "no content"
+ skipped_outs.append((reason, qa))
+ continue
+ # skip if there are not two lines
+ if len(qa.split("\n")) != 2:
+ reason = "not two lines"
+ skipped_outs.append((reason, qa))
+ continue
+
+ q, a = qa.split("\n")
+ # skip if the qa pair don't start with Q: and A:
+ if not q.startswith("Q:") or not a.startswith("A:"):
+ reason = "no Q: or A:"
+ skipped_outs.append((reason, qa))
+ continue
+
+        q = q.removeprefix("Q:").strip()
+        a = a.removeprefix("A:").strip()
+        pairs.append((q, a))
+
+    # De-duplicate while preserving the order in which questions were generated
+    return list(dict.fromkeys(pairs))
\ No newline at end of file