Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added ICLR_2026_Final.pdf
Binary file not shown.
44 changes: 30 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,16 @@ Pre-computed embeddings for reproducing paper results are available on HuggingFa
- 3,072-dimensional OpenAI `text-embedding-3-large` embeddings

```bash
# Using huggingface_hub
pip install huggingface_hub
huggingface-cli download mfwta/RISE-ICLR-2026 --repo-type dataset --local-dir data
```

You can also use Python:
```python
from huggingface_hub import snapshot_download

snapshot_download(
repo_id='mfwta/RISE-ICLR-2026',
repo_type='dataset',
local_dir='data/paper_embeddings'
local_dir='data'
)
```

Expand Down Expand Up @@ -94,26 +91,45 @@ pip install -e .

# 3. Verify paper results
python scripts/verify_paper_results.py
```

The script runs all 3 models × 3 phenomena, computing full 7×7 cross-language
transfer matrices (441 cells total) and comparing against every number in the
paper. Expected output (abridged):

# Expected output:
# negation 0.857 (expected 0.864) PASS
# conditionality 0.828 (expected 0.832) PASS
# politeness 0.805 (expected 0.809) PASS
```
TABLE 2 (Synthetic Multilingual)
Model Obtained Paper Diff
-------------------------------------------------------
text-embedding-3-large 0.7962 0.771 +0.0252
bge-m3 0.7993 0.782 +0.0173
mBERT 0.7662 0.709 +0.0572

SECTION 6.1 (Per-Phenomenon Aggregates)
Phenomenon Obtained Paper Diff
-------------------------------------------------------
negation 0.8061 0.788 +0.0181
conditionality 0.7946 0.780 +0.0146
politeness 0.7610 0.762 -0.0010

Cell diff summary (441 cells):
Mean diff: +0.0332
Mean |diff|: 0.0340
Max |diff|: 0.1545
```

Small positive diffs are expected — the paper values were rounded from a run
with a slightly different numerical pipeline. All obtained scores are within
a few points of the published numbers.

### Full Evaluation Suite

```bash
python -m rise.experiments.run_evaluation \
--data-dir data/paper_embeddings \
--data-dir data/text-embedding-3-large \
--transformations negation conditionality politeness \
--languages en es ja ar th ta zu \
--output-dir results/

# Generate paper figures
python -m rise.experiments.generate_figures \
--results-dir results/ \
--output-dir figures/
```

## Citation
Expand Down
4 changes: 0 additions & 4 deletions requirements-frozen.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
# Frozen dependencies for reproducibility
# These exact versions were used to generate the paper results.
# Install with: pip install -r requirements-frozen.txt

# Core dependencies
torch==2.10.0
numpy==2.4.1
scipy==1.17.0

# Visualization (optional, for generating figures)
matplotlib>=3.7.0

# Development/testing dependencies
pytest>=7.3.0
pytest-cov>=4.1.0
Expand Down
83 changes: 83 additions & 0 deletions scripts/embed_classification_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Embed the negation classification test set (1,919 sentences) with text-embedding-3-large.

This produces the embedded version of the test set used in Table 9 / Appendix G
of the ICLR 2026 paper for downstream negation classification.

Usage:
python scripts/embed_classification_test.py
"""

import json
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load .env from helm or bkup-helm if OPENAI_API_KEY not already set.
# Only the first existing path is used (break after the first hit).
# NOTE: load_dotenv does not override variables already present in the
# environment by default, so a pre-set OPENAI_API_KEY still wins.
for env_path in [Path.home() / "c/helm/.env", Path.home() / "c/bkup-helm/.env"]:
    if env_path.exists():
        load_dotenv(env_path)
        break


def batch_embed(client: OpenAI, texts: list[str], model: str, batch_size: int = 100) -> list[list[float]]:
    """Embed *texts* with the OpenAI embeddings API, one request per batch.

    Args:
        client: Authenticated OpenAI client.
        texts: Sentences to embed.
        model: Embedding model name, e.g. "text-embedding-3-large".
        batch_size: Maximum number of texts sent in a single API request.

    Returns:
        One embedding vector per input text, in the same order as *texts*.
    """
    vectors: list[list[float]] = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc=f"Embedding ({model})"):
        chunk = texts[start : start + batch_size]
        response = client.embeddings.create(input=chunk, model=model)
        # API preserves input order within a request, so extending keeps
        # the global ordering aligned with *texts*.
        vectors.extend(item.embedding for item in response.data)
    return vectors


def main():
    """Embed the negation classification test set and write it as JSONL.

    Reads ``data/negation_classification_test/negation_test_sentences.json``
    (keys ``positive_sentences`` / ``negative_sentences``), embeds every
    sentence with text-embedding-3-large, and writes
    ``negation_test_embedded.jsonl`` next to the input — one record per
    sentence with keys ``text``, ``embedding``, ``label``.

    Exits with status 1 if the input file or the OPENAI_API_KEY
    environment variable is missing.
    """
    data_dir = Path("data/negation_classification_test")
    input_path = data_dir / "negation_test_sentences.json"

    if not input_path.exists():
        print(f"Error: {input_path} not found")
        sys.exit(1)

    with open(input_path) as f:
        data = json.load(f)

    positive = data["positive_sentences"]  # sentences without negation (958 in the paper's set)
    negative = data["negative_sentences"]  # sentences with negation (961 in the paper's set)

    print(f"Loaded {len(positive)} positive + {len(negative)} negative = {len(positive) + len(negative)} sentences")

    # Fail fast with a clear message instead of an opaque KeyError when no
    # API key was found (neither pre-set nor loaded from a .env file).
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY is not set")
        sys.exit(1)

    client = OpenAI(api_key=api_key)
    model = "text-embedding-3-large"

    # Embed positives then negatives in a single pass; labels are built in
    # the same order so the zip() below stays aligned.
    all_sentences = positive + negative
    all_labels = [0] * len(positive) + [1] * len(negative)

    print(f"\nEmbedding {len(all_sentences)} sentences with {model}...")
    all_embeddings = batch_embed(client, all_sentences, model)

    print(f"Got {len(all_embeddings)} embeddings of dimension {len(all_embeddings[0])}")

    # Write as JSONL: one record per sentence.
    output_path = data_dir / "negation_test_embedded.jsonl"
    with open(output_path, "w") as f:
        for sentence, embedding, label in zip(all_sentences, all_embeddings, all_labels):
            record = {
                "text": sentence,
                "embedding": embedding,
                "label": label,  # 0 = no negation, 1 = has negation
            }
            f.write(json.dumps(record) + "\n")

    print(f"\nWrote {len(all_embeddings)} embedded sentences to {output_path}")
    # list.count is clearer (and C-speed) versus a hand-rolled generator sum.
    print(f" label=0 (no negation): {all_labels.count(0)}")
    print(f" label=1 (has negation): {all_labels.count(1)}")


if __name__ == "__main__":
    main()
Loading