4 changes: 4 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
.python-version
anserini_docs
anserini_indicies
data
.DS_Store
dense_hs.*
best_models.py
@@ -171,3 +172,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

dense_models_backup/
*.index
17 changes: 17 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,17 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args": ["--epochs","20","--lr","6e-05","--weight_decay","0.01","--model_dir","dense_models/baseline_distilbert_0","--run_id","baseline_distilbert_0","--model_or_checkpoint","distilbert-base-uncased","--embed_size","768","--batch_size","10","--encode_batch_size","128","--data_path","/home/ddo/CMU/PLLM/TREC-TOT","--negatives_path","/home/ddo/CMU/PLLM/TREC-TOT/negatives/bm25_negatives","--negatives_out","/home/ddo/CMU/PLLM/TREC-TOT/negatives/baseline_distilbert_0_negatives","--query","title_text","--device","cuda"]
}
]
}
10 changes: 5 additions & 5 deletions README.md
@@ -5,9 +5,9 @@ The following benchmarks (& runs) are available:

| Benchmark | Runfiles | Dev-DCG | Dev-Success@1000 | Dev-MRR |
|----------------------|----------|----------|-----------------|-------|
| [BM25](BM25.md) (k1=0.8, b=1.0) | [train](runs/bm25/train.run), [dev](runs/bm25/dev.run) | 0.1314 | 0.4067 | 0.0881 |
| [Dense Retrieval (SBERT)](DENSE.md) (Distilbert) | [train](runs/distilbert/train.run), [dev](runs/distilbert/dev.run) | 0.1627 | 0.6600 | 0.0743 |
| [GPT-4](GPT4.md)* | [train](runs/gpt4/train.run), [dev](runs/gpt4/dev.run) | 0.2407 | 0.3200 | 0.2180 |
| [BM25](docs/BM25.md) (k1=0.8, b=1.0) | [train](runs/bm25/train.run), [dev](runs/bm25/dev.run) | 0.1314 | 0.4067 | 0.0881 |
| [Dense Retrieval (SBERT)](docs/DENSE.md) (Distilbert) | [train](runs/distilbert/train.run), [dev](runs/distilbert/dev.run) | 0.1627 | 0.6600 | 0.0743 |
| [GPT-4](docs/GPT4.md)* | [train](runs/gpt4/train.run), [dev](runs/gpt4/dev.run) | 0.2407 | 0.3200 | 0.2180 |

*: GPT-4 generates 20 candidates at most. See [GPT4](GPT4.md) for more details.

@@ -18,7 +18,7 @@ The following benchmarks (& runs) are available:
## optional: create new environment using py-env virtual-env
## pyenv virtualenv 3.8.11 trec-tot-benchmarks
# install requirements
pip install ir_datasets sentence-transformers==2.2.2 pyserini==0.20.0 pytrec_eval faiss-cpu==1.6.5
pip install ir_datasets sentence-transformers==2.2.2 pyserini==0.20.0 pytrec_eval faiss-cpu==1.6.5 ranx==0.3.7
```

After downloading the files (see guidelines), set DATA_PATH to the folder which
@@ -39,4 +39,4 @@ Quick test to see if data is setup properly:
python tot.py
```
The command above should print the correct number of train/dev queries and the number of documents
in the corpus, along with example queries and documents.
in the corpus, along with example queries and documents.
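
If you prefer to sanity-check the data setup programmatically rather than running `tot.py`, a rough equivalent using the `ir_datasets` API is sketched below. This assumes the splits are exposed as `trec-tot:{train,dev,test}` via `tot.register` (as the scripts in this PR do); `queries_count()` and `docs_count()` are standard `ir_datasets` methods but may behave differently depending on how the dataset was registered.

```python
# Minimal sketch of a data sanity check (assumption: DATA_PATH points at the
# downloaded TREC-ToT files; this mirrors what tot.py is described to print).
import ir_datasets
import tot

DATA_PATH = "./data"      # assumption: adjust to your local data folder
tot.register(DATA_PATH)   # registers the "trec-tot:{train,dev,test}" datasets

for split in ("train", "dev"):
    dataset = ir_datasets.load(f"trec-tot:{split}")
    print(split, "queries:", dataset.queries_count())

dataset = ir_datasets.load("trec-tot:train")
print("documents:", dataset.docs_count())
print("example query:", next(iter(dataset.queries_iter())))
print("example document:", next(iter(dataset.docs_iter())))
```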
2 changes: 1 addition & 1 deletion bm25.py
@@ -18,7 +18,7 @@

log = logging.getLogger(__name__)

METRICS = "recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"
METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"


def create_index(dataset, field_to_index, dest_folder, index):
121 changes: 121 additions & 0 deletions bm25_with_rrf.py
@@ -0,0 +1,121 @@
import json
import argparse
import os
import pandas as pd
import logging


from tqdm import tqdm
from typing import Dict
from pyserini.search.lucene import LuceneSearcher
from pyserini.trectools import TrecRun
from pyserini.fusion import reciprocal_rank_fusion
from ranx import Qrels, Run, evaluate

from modules import llm_based_decomposition, sentence_decomposition
import tot
import ir_datasets
from src import utils
import pytrec_eval

METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"

log = logging.getLogger(__name__)

def main():
parser = argparse.ArgumentParser()
# Path to indexes directory
parser.add_argument("--index_name", default="bm25_0.8_1.0", help="name of index")

parser.add_argument("--decomposition_method", default="llm", help="how to decompose")

parser.add_argument("--data_path", default="./data", help="location to dataset")

parser.add_argument("--split", choices={"train", "dev", "test"}, default="dev", help="split to run")

parser.add_argument("--index_path", default="./anserini_indicies", help="path to store (all) indices")

parser.add_argument("--metrics", default=METRICS, help="csv - metrics to evaluate")

parser.add_argument("--param_k1", default=0.8, type=float, help="param: k1 for BM25")

parser.add_argument("--param_b", default=1.0, type=float, help="param: b for BM25")

# Retrieval depth
parser.add_argument('--K', type=int, help='retrieve top K documents', default=1000)

# Binary flag to enable or disable RM3 query expansion
parser.add_argument('--rm3', type=str, help='enable or disable rm3', choices=['y', 'n'], default='n')

# Run number
parser.add_argument('--run_number', type=int, help='run number', default=1)

# Output options and directory
parser.add_argument('--output_dir', type=str, help='path to output_dir', default="runs/")
args = parser.parse_args()

tot.register(args.data_path)

irds_name = "trec-tot:" + args.split
dataset = ir_datasets.load(irds_name)
if args.decomposition_method == "llm":
queries_expanded = llm_based_decomposition(dataset, f"{args.data_path}/decomposed_queries")
else:
queries_expanded = sentence_decomposition(dataset, f"{args.data_path}/decomposed_queries")

queries = json.load(open(queries_expanded))

run_save_folder = f'{args.output_dir}BM25-RRF'
if args.decomposition_method == "llm":
run_save_folder += f'-llm'

run_save_folder += f'-RM3-{args.run_number}' if args.rm3 == 'y' else f'-{args.run_number}'

run_save_full = f"{run_save_folder}/{args.split}.run"

searcher = LuceneSearcher(os.path.join(args.index_path, args.index_name))
searcher.set_bm25(k1=args.param_k1, b=args.param_b)

if args.rm3 == 'y':
searcher.set_rm3()

# Retrieve
run_result = []

for query_id in tqdm(queries):
for synthetic_query_id in queries[query_id]:
hits = searcher.search(f'{queries[query_id][synthetic_query_id]}', k=args.K)
synthetic_query_results = []
for rank, hit in enumerate(hits, start=1):
synthetic_query_results.append((query_id, 'Q0', hit.docid, rank, hit.score, f'{query_id}_{synthetic_query_id}'))

if synthetic_query_results:
run_result.append(TrecRun.from_list(synthetic_query_results))

results = reciprocal_rank_fusion(run_result, depth=args.K, k=args.K)

print(f"saving run to: {run_save_full}")
os.makedirs(os.path.dirname(run_save_full), exist_ok=True)
results.save_to_txt(run_save_full)

if dataset.has_qrels():

with open(run_save_full, 'r') as h:
run_to_eval = pytrec_eval.parse_run(h)

qrel, n_missing = utils.get_qrel(dataset, run_to_eval)
if n_missing > 0:
raise ValueError(f"Number of missing qids in run: {n_missing}")

evaluator = pytrec_eval.RelevanceEvaluator(
qrel, args.metrics.split(","))

eval_res = evaluator.evaluate(run_to_eval)

eval_res_agg = utils.aggregate_pytrec(eval_res, "mean")

for metric, (mean, std) in eval_res_agg.items():
print(f"{metric:<12}: {mean:.4f} ({std:0.4f})")

if __name__ == '__main__':
main()
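
The fusion step in `bm25_with_rrf.py` delegates to `pyserini.fusion.reciprocal_rank_fusion`, called with `depth=args.K` and `k=args.K`. As a quick illustration of what reciprocal rank fusion computes, here is a minimal, self-contained sketch of the standard formula over plain ranked lists; the constant `k=60` and the toy runs are assumptions for illustration only, not values used by the script.

```python
from collections import defaultdict

def rrf(runs, k=60):
    """Reciprocal rank fusion: score(doc) = sum over runs of 1 / (k + rank)."""
    scores = defaultdict(float)
    for run in runs:                           # each run: doc ids, best first
        for rank, doc_id in enumerate(run, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    # Higher fused score = appears early in many of the per-sub-query runs.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# Toy example: two decomposed sub-queries of the same ToT query.
run_a = ["doc3", "doc1", "doc7"]
run_b = ["doc1", "doc9", "doc3"]
print(rrf([run_a, run_b]))
```

Documents that rank highly under several sub-queries accumulate the largest fused scores, which is the rationale for decomposing a tip-of-the-tongue query before fusing.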
121 changes: 121 additions & 0 deletions dense_rrf.py
@@ -0,0 +1,121 @@
import json
import argparse
import os
import pandas as pd
import logging


from tqdm import tqdm
from typing import Dict
from pyserini.search.lucene import LuceneSearcher
from pyserini.trectools import TrecRun
from pyserini.fusion import reciprocal_rank_fusion
from ranx import Qrels, Run, evaluate

from modules import llm_based_decomposition, sentence_decomposition
import tot
import ir_datasets
from src import utils
import pytrec_eval

METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"

log = logging.getLogger(__name__)

def main():
parser = argparse.ArgumentParser()
# Path to indexes directory
parser.add_argument("--index_name", default="bm25_0.8_1.0", help="name of index")

parser.add_argument("--decomposition_method", default="llm", help="how to decompose")

parser.add_argument("--data_path", default="./data", help="location to dataset")

parser.add_argument("--split", choices={"train", "dev", "test"}, default="dev", help="split to run")

parser.add_argument("--index_path", default="./anserini_indicies", help="path to store (all) indices")

parser.add_argument("--metrics", default=METRICS, help="csv - metrics to evaluate")

parser.add_argument("--param_k1", default=0.8, type=float, help="param: k1 for BM25")

parser.add_argument("--param_b", default=1.0, type=float, help="param: b for BM25")

# Retrieval depth
parser.add_argument('--K', type=int, help='retrieve top K documents', default=1000)

# Binary flag to enable or disable RM3 query expansion
parser.add_argument('--rm3', type=str, help='enable or disable rm3', choices=['y', 'n'], default='n')

# Run number
parser.add_argument('--run_number', type=int, help='run number', default=1)

# Output options and directory
parser.add_argument('--output_dir', type=str, help='path to output_dir', default="runs/")
args = parser.parse_args()

tot.register(args.data_path)

irds_name = "trec-tot:" + args.split
dataset = ir_datasets.load(irds_name)
if args.decomposition_method == "llm":
queries_expanded = llm_based_decomposition(dataset, f"{args.data_path}/decomposed_queries")
else:
queries_expanded = sentence_decomposition(dataset, f"{args.data_path}/decomposed_queries")

queries = json.load(open(queries_expanded))

run_save_folder = f'{args.output_dir}BM25-RRF'
if args.decomposition_method == "llm":
run_save_folder += f'-llm'

run_save_folder += f'-RM3-{args.run_number}' if args.rm3 == 'y' else f'-{args.run_number}'

run_save_full = f"{run_save_folder}/{args.split}.run"

searcher = LuceneSearcher(os.path.join(args.index_path, args.index_name))
searcher.set_bm25(k1=args.param_k1, b=args.param_b)

if args.rm3 == 'y':
searcher.set_rm3()

# Retrieve
run_result = []

for query_id in tqdm(queries):
for synthetic_query_id in queries[query_id]:
hits = searcher.search(f'{queries[query_id][synthetic_query_id]}', k=args.K)
synthetic_query_results = []
for rank, hit in enumerate(hits, start=1):
synthetic_query_results.append((query_id, 'Q0', hit.docid, rank, hit.score, f'{query_id}_{synthetic_query_id}'))

if synthetic_query_results:
run_result.append(TrecRun.from_list(synthetic_query_results))

results = reciprocal_rank_fusion(run_result, depth=args.K, k=args.K)

print(f"saving run to: {run_save_full}")
os.makedirs(os.path.dirname(run_save_full), exist_ok=True)
results.save_to_txt(run_save_full)

if dataset.has_qrels():

with open(run_save_full, 'r') as h:
run_to_eval = pytrec_eval.parse_run(h)

qrel, n_missing = utils.get_qrel(dataset, run_to_eval)
if n_missing > 0:
raise ValueError(f"Number of missing qids in run: {n_missing}")

evaluator = pytrec_eval.RelevanceEvaluator(
qrel, args.metrics.split(","))

eval_res = evaluator.evaluate(run_to_eval)

eval_res_agg = utils.aggregate_pytrec(eval_res, "mean")

for metric, (mean, std) in eval_res_agg.items():
print(f"{metric:<12}: {mean:.4f} ({std:0.4f})")

if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions docid_to_idx.json

Large diffs are not rendered by default.

Empty file added docs/BM25+RRF.md
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
72 changes: 72 additions & 0 deletions gpt4-zeroshot.py
@@ -0,0 +1,72 @@
import time
import openai # for calling the OpenAI API
import pandas as pd # for storing text and embeddings data
import re


base = '/Users/aprameya/Desktop/llms-project/data/'
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4"
openai.api_key = ""


jsonObjdata = pd.read_json("/Users/aprameya/Downloads/corpus.jsonl", lines=True)
ans = []
count=0
d = dict()
for index, row in jsonObjdata.iterrows():
d[row["doc_id"]] = row["page_title"]



jsonObj = pd.read_json(path_or_buf=base+"queries.jsonl", lines=True)
ans = []
actual = []
count=0
for index,row in jsonObj.iterrows():
query = f"""You are an expert in movies. You are helping someone recollect a movie name that is on the tip of their tongue. You respond to each message with a single guess for the name of the movie being described.**important**: you only mention the names of the movie and nothing else. Given below is the movie description:"

Description:
\"\"\"
{row["text"]}
\"\"\"

"""
response = openai.ChatCompletion.create(
messages=[
{'role': 'system', 'content': 'You help someone recollect a movie'},
{'role': 'user', 'content': query},
],
model=GPT_MODEL,
temperature=0,
)
ans.append(response['choices'][0]['message']['content'])
actual.append(d[row['wikipedia_id']])
print(count)
count+=1
time.sleep(3)
with open(base+'results.txt', 'w') as fp:
for item in ans:
fp.write(item+'\n')
with open(base+'labels.txt', 'w') as fp:
for item in actual:
fp.write(item+'\n')


### EVALUATE ###

with open(base+'results.txt', 'r') as f:
guesses = f.readlines()
with open(base+'labels.txt', 'r') as f:
labels = f.readlines()
count = 0
for i in range(len(labels)):
labels[i] = re.sub(r"\(.*?\)|\[.*?\]", "", labels[i])
guesses[i] = re.sub(r"\(.*?\)|\[.*?\]", "", guesses[i])
if(labels[i].strip()==guesses[i].strip()):
count+=1
print(labels[i])

print(count/150)
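
To make the exact-match rule above concrete: the regex strips parenthesised and bracketed qualifiers before comparison, so a guess without a disambiguating year still counts as correct. A tiny illustration (the titles here are made up for this example):

```python
import re

def normalize(title: str) -> str:
    # Same normalization as above: drop "(...)" / "[...]" qualifiers, then strip.
    return re.sub(r"\(.*?\)|\[.*?\]", "", title).strip()

print(normalize("Inception (2010 film)"))                             # -> "Inception"
print(normalize("Inception") == normalize("Inception (2010 film)"))   # -> True
```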


1 change: 1 addition & 0 deletions idx_to_docid.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions modules/__init__.py
@@ -0,0 +1 @@
from .query_decomposition import *