4 changes: 4 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
.python-version
anserini_docs
anserini_indicies
data
.DS_Store
dense_hs.*
best_models.py
@@ -171,3 +172,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

dense_models_backup/
*.index
17 changes: 17 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,17 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args": ["--epochs","20","--lr","6e-05","--weight_decay","0.01","--model_dir","dense_models/baseline_distilbert_0","--run_id","baseline_distilbert_0","--model_or_checkpoint","distilbert-base-uncased","--embed_size","768","--batch_size","10","--encode_batch_size","128","--data_path","/home/ddo/CMU/PLLM/TREC-TOT","--negatives_path","/home/ddo/CMU/PLLM/TREC-TOT/negatives/bm25_negatives","--negatives_out","/home/ddo/CMU/PLLM/TREC-TOT/negatives/baseline_distilbert_0_negatives","--query","title_text","--device","cuda"]
}
]
}
10 changes: 5 additions & 5 deletions README.md
@@ -5,9 +5,9 @@ The following benchmarks (& runs) are available:

| Benchmark | Runfiles | Dev-DCG | Dev-Success@1000 | Dev-MRR |
|----------------------|----------|----------|-----------------|-------|
| [BM25](BM25.md) (k1=0.8, b=1.0) | [train](runs/bm25/train.run), [dev](runs/bm25/dev.run) | 0.1314 | 0.4067 | 0.0881 |
| [Dense Retrieval (SBERT)](DENSE.md) (Distilbert) | [train](runs/distilbert/train.run), [dev](runs/distilbert/dev.run) | 0.1627 | 0.6600 | 0.0743 |
| [GPT-4](GPT4.md)* | [train](runs/gpt4/train.run), [dev](runs/gpt4/dev.run) | 0.2407 | 0.3200 | 0.2180 |
| [BM25](docs/BM25.md) (k1=0.8, b=1.0) | [train](runs/bm25/train.run), [dev](runs/bm25/dev.run) | 0.1314 | 0.4067 | 0.0881 |
| [Dense Retrieval (SBERT)](docs/DENSE.md) (Distilbert) | [train](runs/distilbert/train.run), [dev](runs/distilbert/dev.run) | 0.1627 | 0.6600 | 0.0743 |
| [GPT-4](docs/GPT4.md)* | [train](runs/gpt4/train.run), [dev](runs/gpt4/dev.run) | 0.2407 | 0.3200 | 0.2180 |

*: GPT-4 generates 20 candidates at most. See [GPT4](GPT4.md) for more details.

@@ -18,7 +18,7 @@ The following benchmarks (& runs) are available:
## optional: create new environment using py-env virtual-env
## pyenv virtualenv 3.8.11 trec-tot-benchmarks
# install requirements
pip install ir_datasets sentence-transformers==2.2.2 pyserini==0.20.0 pytrec_eval faiss-cpu==1.6.5
pip install ir_datasets sentence-transformers==2.2.2 pyserini==0.20.0 pytrec_eval faiss-cpu==1.6.5 ranx==0.3.7
```

After downloading the files (see guidelines), set DATA_PATH to the folder which
@@ -39,4 +39,4 @@ Quick test to see if data is setup properly:
python tot.py
```
The command above should print the correct number of train/dev queries and the number of documents
in the corpus, along with example queries and documents.
in the corpus, along with example queries and documents.
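
If you prefer to sanity-check the data setup programmatically rather than running `tot.py`, a rough equivalent using the `ir_datasets` API is sketched below. This assumes the splits are exposed as `trec-tot:{train,dev,test}` via `tot.register` (as the scripts in this PR do); `queries_count()` and `docs_count()` are standard `ir_datasets` methods but may behave differently depending on how the dataset was registered.

```python
# Minimal sketch of a data sanity check (assumption: DATA_PATH points at the
# downloaded TREC-ToT files; this mirrors what tot.py is described to print).
import ir_datasets
import tot

DATA_PATH = "./data"      # assumption: adjust to your local data folder
tot.register(DATA_PATH)   # registers the "trec-tot:{train,dev,test}" datasets

for split in ("train", "dev"):
    dataset = ir_datasets.load(f"trec-tot:{split}")
    print(split, "queries:", dataset.queries_count())

dataset = ir_datasets.load("trec-tot:train")
print("documents:", dataset.docs_count())
print("example query:", next(iter(dataset.queries_iter())))
print("example document:", next(iter(dataset.docs_iter())))
```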
2 changes: 1 addition & 1 deletion bm25.py
@@ -18,7 +18,7 @@

log = logging.getLogger(__name__)

METRICS = "recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"
METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"


def create_index(dataset, field_to_index, dest_folder, index):
121 changes: 121 additions & 0 deletions bm25_with_rrf.py
@@ -0,0 +1,121 @@
import json
import argparse
import os
import pandas as pd
import logging


from tqdm import tqdm
from typing import Dict
from pyserini.search.lucene import LuceneSearcher
from pyserini.trectools import TrecRun
from pyserini.fusion import reciprocal_rank_fusion
from ranx import Qrels, Run, evaluate

from modules import llm_based_decomposition, sentence_decomposition
import tot
import ir_datasets
from src import utils
import pytrec_eval

METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"

log = logging.getLogger(__name__)

def main():
parser = argparse.ArgumentParser()
# Path to indexes directory
parser.add_argument("--index_name", default="bm25_0.8_1.0", help="name of index")

parser.add_argument("--decomposition_method", default="llm", help="how to decompose")

parser.add_argument("--data_path", default="./data", help="location to dataset")

parser.add_argument("--split", choices={"train", "dev", "test"}, default="dev", help="split to run")

parser.add_argument("--index_path", default="./anserini_indicies", help="path to store (all) indices")

parser.add_argument("--metrics", default=METRICS, help="csv - metrics to evaluate")

parser.add_argument("--param_k1", default=0.8, type=float, help="param: k1 for BM25")

parser.add_argument("--param_b", default=1.0, type=float, help="param: b for BM25")

# Retrieval depth
parser.add_argument('--K', type=int, help='retrieve top K documents', default=1000)

# Binary flag to enable or disable RM3 query expansion
parser.add_argument('--rm3', type=str, help='enable or disable rm3', choices=['y', 'n'], default='n')

# Run number
parser.add_argument('--run_number', type=int, help='run number', default=1)

# Output options and directory
parser.add_argument('--output_dir', type=str, help='path to output_dir', default="runs/")
args = parser.parse_args()

tot.register(args.data_path)

irds_name = "trec-tot:" + args.split
dataset = ir_datasets.load(irds_name)
if args.decomposition_method == "llm":
queries_expanded = llm_based_decomposition(dataset, f"{args.data_path}/decomposed_queries")
else:
queries_expanded = sentence_decomposition(dataset, f"{args.data_path}/decomposed_queries")

queries = json.load(open(queries_expanded))

run_save_folder = f'{args.output_dir}BM25-RRF'
if args.decomposition_method == "llm":
run_save_folder += f'-llm'

run_save_folder += f'-RM3-{args.run_number}' if args.rm3 == 'y' else f'-{args.run_number}'

run_save_full = f"{run_save_folder}/{args.split}.run"

searcher = LuceneSearcher(os.path.join(args.index_path, args.index_name))
searcher.set_bm25(k1=args.param_k1, b=args.param_b)

if args.rm3 == 'y':
searcher.set_rm3()

# Retrieve
run_result = []

for query_id in tqdm(queries):
for synthetic_query_id in queries[query_id]:
hits = searcher.search(f'{queries[query_id][synthetic_query_id]}', k=args.K)
synthetic_query_results = []
for rank, hit in enumerate(hits, start=1):
synthetic_query_results.append((query_id, 'Q0', hit.docid, rank, hit.score, f'{query_id}_{synthetic_query_id}'))

if synthetic_query_results:
run_result.append(TrecRun.from_list(synthetic_query_results))

results = reciprocal_rank_fusion(run_result, depth=args.K, k=args.K)

print(f"saving run to: {run_save_full}")
os.makedirs(os.path.dirname(run_save_full), exist_ok=True)
results.save_to_txt(run_save_full)

if dataset.has_qrels():

with open(run_save_full, 'r') as h:
run_to_eval = pytrec_eval.parse_run(h)

qrel, n_missing = utils.get_qrel(dataset, run_to_eval)
if n_missing > 0:
raise ValueError(f"Number of missing qids in run: {n_missing}")

evaluator = pytrec_eval.RelevanceEvaluator(
qrel, args.metrics.split(","))

eval_res = evaluator.evaluate(run_to_eval)

eval_res_agg = utils.aggregate_pytrec(eval_res, "mean")

for metric, (mean, std) in eval_res_agg.items():
print(f"{metric:<12}: {mean:.4f} ({std:0.4f})")

if __name__ == '__main__':
main()
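
The fusion step in `bm25_with_rrf.py` delegates to `pyserini.fusion.reciprocal_rank_fusion`, called with `depth=args.K` and `k=args.K`. As a quick illustration of what reciprocal rank fusion computes, here is a minimal, self-contained sketch of the standard formula over plain ranked lists; the constant `k=60` and the toy runs are assumptions for illustration only, not values used by the script.

```python
from collections import defaultdict

def rrf(runs, k=60):
    """Reciprocal rank fusion: score(doc) = sum over runs of 1 / (k + rank)."""
    scores = defaultdict(float)
    for run in runs:                           # each run: doc ids, best first
        for rank, doc_id in enumerate(run, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    # Higher fused score = appears early in many of the per-sub-query runs.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# Toy example: two decomposed sub-queries of the same ToT query.
run_a = ["doc3", "doc1", "doc7"]
run_b = ["doc1", "doc9", "doc3"]
print(rrf([run_a, run_b]))
```

Documents that rank highly under several sub-queries accumulate the largest fused scores, which is the rationale for decomposing a tip-of-the-tongue query before fusing.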
121 changes: 121 additions & 0 deletions dense_rrf.py
@@ -0,0 +1,121 @@
import json
import argparse
import os
import pandas as pd
import logging


from tqdm import tqdm
from typing import Dict
from pyserini.search.lucene import LuceneSearcher
from pyserini.trectools import TrecRun
from pyserini.fusion import reciprocal_rank_fusion
from ranx import Qrels, Run, evaluate

from modules import llm_based_decomposition, sentence_decomposition
import tot
import ir_datasets
from src import utils
import pytrec_eval

METRICS = "P_1,recall_10,recall_100,recall_1000,ndcg_cut_10,ndcg_cut_100,ndcg_cut_1000,recip_rank"

log = logging.getLogger(__name__)

def main():
parser = argparse.ArgumentParser()
# Path to indexes directory
parser.add_argument("--index_name", default="bm25_0.8_1.0", help="name of index")

parser.add_argument("--decomposition_method", default="llm", help="how to decompose")

parser.add_argument("--data_path", default="./data", help="location to dataset")

parser.add_argument("--split", choices={"train", "dev", "test"}, default="dev", help="split to run")

parser.add_argument("--index_path", default="./anserini_indicies", help="path to store (all) indices")

parser.add_argument("--metrics", default=METRICS, help="csv - metrics to evaluate")

parser.add_argument("--param_k1", default=0.8, type=float, help="param: k1 for BM25")

parser.add_argument("--param_b", default=1.0, type=float, help="param: b for BM25")

# Retrieval depth
parser.add_argument('--K', type=int, help='retrieve top K documents', default=1000)

# Binary flag to enable or disable RM3 query expansion
parser.add_argument('--rm3', type=str, help='enable or disable rm3', choices=['y', 'n'], default='n')

# Run number
parser.add_argument('--run_number', type=int, help='run number', default=1)

# Output options and directory
parser.add_argument('--output_dir', type=str, help='path to output_dir', default="runs/")
args = parser.parse_args()

tot.register(args.data_path)

irds_name = "trec-tot:" + args.split
dataset = ir_datasets.load(irds_name)
if args.decomposition_method == "llm":
queries_expanded = llm_based_decomposition(dataset, f"{args.data_path}/decomposed_queries")
else:
queries_expanded = sentence_decomposition(dataset, f"{args.data_path}/decomposed_queries")

queries = json.load(open(queries_expanded))

run_save_folder = f'{args.output_dir}BM25-RRF'
if args.decomposition_method == "llm":
run_save_folder += f'-llm'

run_save_folder += f'-RM3-{args.run_number}' if args.rm3 == 'y' else f'-{args.run_number}'

run_save_full = f"{run_save_folder}/{args.split}.run"

searcher = LuceneSearcher(os.path.join(args.index_path, args.index_name))
searcher.set_bm25(k1=args.param_k1, b=args.param_b)

if args.rm3 == 'y':
searcher.set_rm3()

# Retrieve
run_result = []

for query_id in tqdm(queries):
for synthetic_query_id in queries[query_id]:
hits = searcher.search(f'{queries[query_id][synthetic_query_id]}', k=args.K)
synthetic_query_results = []
for rank, hit in enumerate(hits, start=1):
synthetic_query_results.append((query_id, 'Q0', hit.docid, rank, hit.score, f'{query_id}_{synthetic_query_id}'))

if synthetic_query_results:
run_result.append(TrecRun.from_list(synthetic_query_results))

results = reciprocal_rank_fusion(run_result, depth=args.K, k=args.K)

print(f"saving run to: {run_save_full}")
os.makedirs(os.path.dirname(run_save_full), exist_ok=True)
results.save_to_txt(run_save_full)

if dataset.has_qrels():

with open(run_save_full, 'r') as h:
run_to_eval = pytrec_eval.parse_run(h)

qrel, n_missing = utils.get_qrel(dataset, run_to_eval)
if n_missing > 0:
raise ValueError(f"Number of missing qids in run: {n_missing}")

evaluator = pytrec_eval.RelevanceEvaluator(
qrel, args.metrics.split(","))

eval_res = evaluator.evaluate(run_to_eval)

eval_res_agg = utils.aggregate_pytrec(eval_res, "mean")

for metric, (mean, std) in eval_res_agg.items():
print(f"{metric:<12}: {mean:.4f} ({std:0.4f})")

if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions docid_to_idx.json

Large diffs are not rendered by default.

Empty file added docs/BM25+RRF.md
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
72 changes: 72 additions & 0 deletions gpt4-zeroshot.py
@@ -0,0 +1,72 @@
import time
import openai # for calling the OpenAI API
import pandas as pd # for storing text and embeddings data
import re


base = '/Users/aprameya/Desktop/llms-project/data/'
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4"
openai.api_key = ""


jsonObjdata = pd.read_json("/Users/aprameya/Downloads/corpus.jsonl", lines=True)
ans = []
count=0
d = dict()
for index, row in jsonObjdata.iterrows():
d[row["doc_id"]] = row["page_title"]



jsonObj = pd.read_json(path_or_buf=base+"queries.jsonl", lines=True)
ans = []
actual = []
count=0
for index,row in jsonObj.iterrows():
query = f"""You are an expert in movies. You are helping someone recollect a movie name that is on the tip of their tongue. You respond to each message with a single guess for the name of the movie being described.**important**: you only mention the names of the movie and nothing else. Given below is the movie description:"

Description:
\"\"\"
{row["text"]}
\"\"\"

"""
response = openai.ChatCompletion.create(
messages=[
{'role': 'system', 'content': 'You help someone recollect a movie'},
{'role': 'user', 'content': query},
],
model=GPT_MODEL,
temperature=0,
)
ans.append(response['choices'][0]['message']['content'])
actual.append(d[row['wikipedia_id']])
print(count)
count+=1
time.sleep(3)
with open(base+'results.txt', 'w') as fp:
for item in ans:
fp.write(item+'\n')
with open(base+'labels.txt', 'w') as fp:
for item in actual:
fp.write(item+'\n')


### EVALUATE ###

with open(base+'results.txt', 'r') as f:
guesses = f.readlines()
with open(base+'labels.txt', 'r') as f:
labels = f.readlines()
count = 0
for i in range(len(labels)):
labels[i] = re.sub(r"\(.*?\)|\[.*?\]", "", labels[i])
guesses[i] = re.sub(r"\(.*?\)|\[.*?\]", "", guesses[i])
if(labels[i].strip()==guesses[i].strip()):
count+=1
print(labels[i])

print(count/150)
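
To make the exact-match rule above concrete: the regex strips parenthesised and bracketed qualifiers before comparison, so a guess without a disambiguating year still counts as correct. A tiny illustration (the titles here are made up for this example):

```python
import re

def normalize(title: str) -> str:
    # Same normalization as above: drop "(...)" / "[...]" qualifiers, then strip.
    return re.sub(r"\(.*?\)|\[.*?\]", "", title).strip()

print(normalize("Inception (2010 film)"))                             # -> "Inception"
print(normalize("Inception") == normalize("Inception (2010 film)"))   # -> True
```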


1 change: 1 addition & 0 deletions idx_to_docid.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions modules/__init__.py
@@ -0,0 +1 @@
from .query_decomposition import *