diff --git a/config/dataset/beir/climate-fever.yaml b/config/dataset/beir/climate-fever.yaml new file mode 100644 index 0000000..74ea66d --- /dev/null +++ b/config/dataset/beir/climate-fever.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/climate-fever" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/climate-fever" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/dbpedia.yaml b/config/dataset/beir/dbpedia.yaml new file mode 100644 index 0000000..c06778b --- /dev/null +++ b/config/dataset/beir/dbpedia.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/dbpedia-entity/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/dbpedia-entity/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/fever.yaml b/config/dataset/beir/fever.yaml new file mode 100644 index 0000000..cf3254c --- /dev/null +++ b/config/dataset/beir/fever.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/fever/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/fever/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/fiqa.yaml b/config/dataset/beir/fiqa.yaml new file mode 100644 index 0000000..b0473bf --- /dev/null +++ b/config/dataset/beir/fiqa.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: 
"beir/fiqa/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/fiqa/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/hotpotqa.yaml b/config/dataset/beir/hotpotqa.yaml new file mode 100644 index 0000000..cd0880d --- /dev/null +++ b/config/dataset/beir/hotpotqa.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/hotpotqa/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/hotpotqa/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/lotte_forum.yaml b/config/dataset/beir/lotte_forum.yaml new file mode 100644 index 0000000..17c5cd3 --- /dev/null +++ b/config/dataset/beir/lotte_forum.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "lotte/pooled/test/forum" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "lotte/pooled/test/forum" + split: "full" +test: + doc: null + query: null diff --git a/config/dataset/beir/lotte_search.yaml b/config/dataset/beir/lotte_search.yaml new file mode 100644 index 0000000..389400c --- /dev/null +++ b/config/dataset/beir/lotte_search.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "lotte/pooled/test/search" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "lotte/pooled/test/search" + split: "full" +test: + doc: null + query: null diff --git a/config/dataset/beir/nfcorpus.yaml b/config/dataset/beir/nfcorpus.yaml new file mode 100644 index 
0000000..61e3fd0 --- /dev/null +++ b/config/dataset/beir/nfcorpus.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/nfcorpus/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/nfcorpus/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/nq.yaml b/config/dataset/beir/nq.yaml new file mode 100644 index 0000000..15fad26 --- /dev/null +++ b/config/dataset/beir/nq.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/nq" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/nq" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/quora.yaml b/config/dataset/beir/quora.yaml new file mode 100644 index 0000000..2c2d273 --- /dev/null +++ b/config/dataset/beir/quora.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/quora/test" + split: "full" + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/quora/test" + split: "full" + +test: + doc: null + query: null diff --git a/config/dataset/beir/scidocs.yaml b/config/dataset/beir/scidocs.yaml new file mode 100644 index 0000000..2b07173 --- /dev/null +++ b/config/dataset/beir/scidocs.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/scidocs" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/scidocs" + split: "full" +test: + doc: null + query: null \ No 
newline at end of file diff --git a/config/dataset/beir/scifact.yaml b/config/dataset/beir/scifact.yaml new file mode 100644 index 0000000..9aa23e6 --- /dev/null +++ b/config/dataset/beir/scifact.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/scifact/test" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/scifact/test" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/trec-covid.yaml b/config/dataset/beir/trec-covid.yaml new file mode 100644 index 0000000..60e7030 --- /dev/null +++ b/config/dataset/beir/trec-covid.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/trec-covid" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/trec-covid" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/beir/webis-touche.yaml b/config/dataset/beir/webis-touche.yaml new file mode 100644 index 0000000..7d8783c --- /dev/null +++ b/config/dataset/beir/webis-touche.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.dataset_processor.IRDSDocProcessor + irds_name: "beir/webis-touche2020/v2" + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.IRDSQueryProcessor + irds_name: "beir/webis-touche2020/v2" + split: "full" +test: + doc: null + query: null \ No newline at end of file diff --git a/config/dataset/bright/aops.yaml b/config/dataset/bright/aops.yaml new file mode 100644 index 0000000..aa18b25 --- /dev/null +++ b/config/dataset/bright/aops.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: 
modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "aops" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "aops" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/biology.yaml b/config/dataset/bright/biology.yaml new file mode 100644 index 0000000..49a1909 --- /dev/null +++ b/config/dataset/bright/biology.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "biology" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "biology" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/earth_science.yaml b/config/dataset/bright/earth_science.yaml new file mode 100644 index 0000000..3990005 --- /dev/null +++ b/config/dataset/bright/earth_science.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "earth_science" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "earth_science" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/economics.yaml b/config/dataset/bright/economics.yaml new file mode 100644 index 0000000..536d34a --- /dev/null +++ b/config/dataset/bright/economics.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "economics" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "economics" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/leetcode.yaml 
b/config/dataset/bright/leetcode.yaml new file mode 100644 index 0000000..308fde3 --- /dev/null +++ b/config/dataset/bright/leetcode.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "leetcode" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "leetcode" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/pony.yaml b/config/dataset/bright/pony.yaml new file mode 100644 index 0000000..c57261e --- /dev/null +++ b/config/dataset/bright/pony.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "pony" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "pony" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/psychology.yaml b/config/dataset/bright/psychology.yaml new file mode 100644 index 0000000..93b281d --- /dev/null +++ b/config/dataset/bright/psychology.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "psychology" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "psychology" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/robotics.yaml b/config/dataset/bright/robotics.yaml new file mode 100644 index 0000000..32a192d --- /dev/null +++ b/config/dataset/bright/robotics.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "robotics" + longdoc: False + + query: + init_args: + 
_target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "robotics" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/stackoverflow.yaml b/config/dataset/bright/stackoverflow.yaml new file mode 100644 index 0000000..3a0f5da --- /dev/null +++ b/config/dataset/bright/stackoverflow.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "stackoverflow" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "stackoverflow" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/sustainable_living.yaml b/config/dataset/bright/sustainable_living.yaml new file mode 100644 index 0000000..81524a8 --- /dev/null +++ b/config/dataset/bright/sustainable_living.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "sustainable_living" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "sustainable_living" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/theoremqa_questions.yaml b/config/dataset/bright/theoremqa_questions.yaml new file mode 100644 index 0000000..2c9c5f5 --- /dev/null +++ b/config/dataset/bright/theoremqa_questions.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "theoremqa_questions" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "theoremqa_questions" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/bright/theoremqa_theorems.yaml 
b/config/dataset/bright/theoremqa_theorems.yaml new file mode 100644 index 0000000..a13b479 --- /dev/null +++ b/config/dataset/bright/theoremqa_theorems.yaml @@ -0,0 +1,19 @@ +train: + doc: null + query: null + +dev: + doc: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTDocProcessor + split: "theoremqa_theorems" + longdoc: False + + query: + init_args: + _target_: modules.processors.bright_dataset_processor.BRIGHTQueryProcessor + split: "theoremqa_theorems" + longdoc: False +test: + doc: null + query: null diff --git a/config/dataset/msmarco-train.yaml b/config/dataset/msmarco-train.yaml new file mode 100644 index 0000000..4c6c3a5 --- /dev/null +++ b/config/dataset/msmarco-train.yaml @@ -0,0 +1,16 @@ +train: + doc: null + query: null +dev: + doc: + init_args: + _target_: modules.dataset_processor.MsMarcoCollection + split: "full" + + query: + init_args: + _target_: modules.dataset_processor.MsMarcoTrainQueries + split: "full" +test: + doc: null + query: null diff --git a/config/reranker/bge.yaml b/config/reranker/bge.yaml index 11216c1..c405a9c 100644 --- a/config/reranker/bge.yaml +++ b/config/reranker/bge.yaml @@ -1,5 +1,5 @@ init_args: _target_: models.rerankers.crossencoder.CrossEncoder - model_name: "BAAI/bge-large-en" + model_name: "BAAI/bge-reranker-large" max_len: 256 batch_size: 256 diff --git a/config/reranker/mixbread.yaml b/config/reranker/mixbread.yaml new file mode 100644 index 0000000..eb51a46 --- /dev/null +++ b/config/reranker/mixbread.yaml @@ -0,0 +1,5 @@ +init_args: + _target_: models.rerankers.crossencoder.CrossEncoder + model_name: "mixedbread-ai/mxbai-rerank-large-v1" + max_len: 256 +batch_size: 64 diff --git a/config/reranker/rankllama.yaml b/config/reranker/rankllama.yaml new file mode 100644 index 0000000..df79e30 --- /dev/null +++ b/config/reranker/rankllama.yaml @@ -0,0 +1,5 @@ +init_args: + _target_: models.rerankers.crossencoder.CrossEncoder + model_name: "castorini/rankllama-v1-7b-lora-passage" + max_len: 
256 +batch_size: 32 diff --git a/models/rerankers/crossencoder.py b/models/rerankers/crossencoder.py index bb5476c..1c1be33 100644 --- a/models/rerankers/crossencoder.py +++ b/models/rerankers/crossencoder.py @@ -14,8 +14,14 @@ class CrossEncoder(Reranker): def __init__(self, model_name=None,max_len=512): self.model_name = model_name self.max_len= max_len - self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, max_length=self.max_len) + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=1, torch_dtype=torch.float16) + if model_name== 'castorini/rankllama-v1-7b-lora-passage': + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="right") + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.model.config.pad_token_id = self.tokenizer.pad_token_id + else: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, max_length=self.max_len) self.model.eval() if torch.cuda.device_count() > 1 and torch.cuda.is_available(): self.model = torch.nn.DataParallel(self.model) diff --git a/models/retrievers/repllama.py b/models/retrievers/repllama.py index 0bb308c..78fac23 100644 --- a/models/retrievers/repllama.py +++ b/models/retrievers/repllama.py @@ -55,7 +55,7 @@ def collate_fn(self, batch, query_or_doc): return_dict = self.tokenizer(content, padding=True, truncation=True, max_length=self.max_len,return_tensors='pt') return return_dict - def __call__(self, kwargs): + def __call__(self, query_or_doc, kwargs): kwargs = {key: value.to(self.device) for key, value in kwargs.items()} # get accumulated eos token counts per exmaple accumulated_eos_tokens = (kwargs['input_ids'] != self.tokenizer.pad_token_id).cumsum(dim=1) diff --git a/modules/dataset_processor.py b/modules/dataset_processor.py 
index 69923c7..c18931a 100644 --- a/modules/dataset_processor.py +++ b/modules/dataset_processor.py @@ -483,15 +483,74 @@ def __init__(self, *args, **kwargs): def process(self): # load from the ir-dataset HF repo hf_name = "irds/msmarco-passage" - dataset = datasets.load_dataset(hf_name, 'docs', num_proc=self.num_proc) # no need for split? + dataset = datasets.load_dataset(hf_name, 'docs', num_proc=self.num_proc,trust_remote_code=True) # no need for split? dataset = dataset.rename_column("doc_id", "id") dataset = dataset.rename_column("text", "content") return dataset +class MsMarcoTrainQueries(Processor): + + def __init__(self, *args, **kwargs): + dataset_name = 'ms-marco-train-queries' + super().__init__(*args, **kwargs, dataset_name=dataset_name) + + def process(self): + import ir_datasets + ird = ir_datasets.load("msmarco-passage/train/judged") + Qid= [q.query_id for q in ird.queries_iter()] + Qtext= [q.text for q in ird.queries_iter()] + hf_dataset= datasets.Dataset.from_dict({'id':Qid, 'content':Qtext}) + return hf_dataset + # applies processing to dataset names # processes query and doc with different processors +class IRDSDocProcessor(Processor): + def __init__(self, irds_name,*args, **kwargs): + dataset_name = irds_name.replace('/','_')+'_doc' + super().__init__(*args, **kwargs, dataset_name=dataset_name) + self.irds_name=irds_name + + def process(self): + import ir_datasets + dataset = ir_datasets.load(self.irds_name) + print(dataset) + def dataset_generator(): + for doc in dataset.docs_iter(): + doc # namedtuple + doc_text='' + if hasattr(doc,'title'): + doc_text+=doc.title+ ' ' +doc.text + else: + doc_text+=doc.text + yield {'id':doc.doc_id, 'content':doc_text} + + hf_dataset= datasets.Dataset.from_generator(dataset_generator) + return hf_dataset + + +class IRDSQueryProcessor(Processor): + def __init__(self, irds_name,*args, **kwargs): + dataset_name = irds_name.replace('/','_')+'_query' + self.irds_name=irds_name + super().__init__(*args, **kwargs, 
dataset_name=dataset_name) + + def process(self): + import ir_datasets + dataset = ir_datasets.load(self.irds_name) + print(dataset) + def dataset_generator(): + for doc in dataset.queries_iter(): + ## namedtuple + yield {'id':doc.query_id, 'content':doc.text} + + hf_dataset= datasets.Dataset.from_generator(dataset_generator) + return hf_dataset + + + + class UT1Queries(Processor): def __init__(self, *args, **kwargs): dataset_name = 'ut1queries' diff --git a/modules/processors/bright_dataset_processor.py b/modules/processors/bright_dataset_processor.py new file mode 100644 index 0000000..b20ff5b --- /dev/null +++ b/modules/processors/bright_dataset_processor.py @@ -0,0 +1,47 @@ +from ..dataset_processor import * +import datasets +import requests + + + + + +class BRIGHTDocProcessor(Processor): + def __init__(self, longdoc, split,*args, **kwargs): + dataset_name = 'BRIGHT_%s'% split + super().__init__(*args, **kwargs, split=split,dataset_name=dataset_name) + self.longdoc = longdoc + + def process(self): + hf_name = 'xlangai/BRIGHT' + doc = 'long_documents' if self.longdoc else 'documents' + dataset = datasets.load_dataset(hf_name, doc,num_proc=self.num_proc)[self.split] + return dataset + + + + +class BRIGHTQueryProcessor(Processor): + def __init__(self, longdoc,split,qlen=-1,*args, **kwargs): + dataset_name = 'BRIGHTQuery_%s' %split + super().__init__(*args, **kwargs, split=split,dataset_name=dataset_name) + self.longdoc = longdoc + self.qlen = qlen + + def process(self): + hf_name = 'xlangai/BRIGHT' + dataset = datasets.load_dataset(hf_name, "examples",num_proc=self.num_proc)[self.split] + dataset = dataset.rename_column("query", "content") + if self.qlen != -1: + dataset = dataset.map(lambda x:{'content':" ".join(x['content'].split()[:self.qlen])}) + if self.longdoc: + dataset = dataset.rename_column("gold_ids_long", "ranking_label") + else: + dataset = dataset.rename_column("gold_ids", "ranking_label") + + dataset = dataset.remove_columns(['reasoning', 
'excluded_ids', 'gold_ids' if self.longdoc else 'gold_ids_long']) + + return dataset + + +