From cdaf034ca2a1aed80f660f58b48cdf2b47eba212 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Wed, 10 Dec 2025 03:36:11 -0500 Subject: [PATCH 01/14] rewrote passage chunking --- src/tevatron/retriever/arguments.py | 4 + src/tevatron/retriever/collator.py | 97 +++++++++++++++++----- src/tevatron/retriever/driver/train.py | 1 + src/tevatron/retriever/modeling/dense.py | 35 +++++++- src/tevatron/retriever/modeling/encoder.py | 55 ++++++++---- src/tevatron/retriever/trainer.py | 5 +- 6 files changed, 157 insertions(+), 40 deletions(-) diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index 00034903..cce3285f 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -203,6 +203,10 @@ class DataArguments: metadata={"help": "padding side for the tokenizer, can be 'left' or 'right'"} ) + passage_chunk_size: int = field( + default=0, + metadata={"help": "Chunk size for chunked passage encoding with MaxSim. 0=disabled, >0=chunk size in tokens"} + ) @dataclass diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 20e02ef5..6323fcc1 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -24,7 +24,7 @@ def __call__(self, features: List[Tuple[str, List[str]]]): """ Collate function for training. :param features: list of (query, passages) tuples - :return: tokenized query_ids, passage_ids + :return: tokenized query_ids, passage_ids, [eos_positions if chunked] """ all_queries = [f[0] for f in features] all_passages = [] @@ -32,6 +32,8 @@ def __call__(self, features: List[Tuple[str, List[str]]]): all_passages.extend(f[1]) all_queries = [q[0] for q in all_queries] all_passages = [p[0] for p in all_passages] + + # Query tokenization q_collated = self.tokenizer( all_queries, padding=False, @@ -41,20 +43,8 @@ def __call__(self, features: List[Tuple[str, List[str]]]): return_token_type_ids=False, add_special_tokens=True, ) - d_collated = self.tokenizer( - all_passages, - padding=False, - truncation=True, - max_length=self.data_args.passage_max_len-1 if self.data_args.append_eos_token else self.data_args.passage_max_len, - return_attention_mask=False, - return_token_type_ids=False, - add_special_tokens=True, - ) - if self.data_args.append_eos_token: q_collated['input_ids'] = [q + [self.tokenizer.eos_token_id] for q in q_collated['input_ids']] - d_collated['input_ids'] = [d + [self.tokenizer.eos_token_id] for d in d_collated['input_ids']] - q_collated = self.tokenizer.pad( q_collated, padding=True, @@ -62,14 +52,79 @@ def __call__(self, features: List[Tuple[str, List[str]]]): return_attention_mask=True, return_tensors='pt', ) - d_collated = self.tokenizer.pad( - d_collated, - padding=True, - pad_to_multiple_of=self.data_args.pad_to_multiple_of, - return_attention_mask=True, - return_tensors='pt', - ) - return q_collated, d_collated + + # Passage tokenization + if self.data_args.passage_chunk_size > 0: + d_collated, eos_positions = self._tokenize_chunked_passages(all_passages) + return q_collated, d_collated, eos_positions + else: + d_collated = self.tokenizer( + all_passages, + padding=False, + truncation=True, + max_length=self.data_args.passage_max_len-1 if self.data_args.append_eos_token else self.data_args.passage_max_len, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=True, + ) + if self.data_args.append_eos_token: + d_collated['input_ids'] = [d + [self.tokenizer.eos_token_id] for d in d_collated['input_ids']] + d_collated = 
self.tokenizer.pad( + d_collated, + padding=True, + pad_to_multiple_of=self.data_args.pad_to_multiple_of, + return_attention_mask=True, + return_tensors='pt', + ) + return q_collated, d_collated + + def _tokenize_chunked_passages(self, passages: List[str]): + """ + Tokenize passages with EOS separators between chunks. + Each chunk ends with EOS, enabling extraction of chunk embeddings from EOS positions. + """ + chunk_size = self.data_args.passage_chunk_size + eos_id = self.tokenizer.eos_token_id + pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 + + all_input_ids = [] + all_eos_positions = [] + + for passage in passages: + tokens = self.tokenizer.encode(passage, add_special_tokens=False) + + new_tokens = [] + eos_positions = [] + for i in range(0, max(len(tokens), 1), chunk_size): + chunk = tokens[i:i + chunk_size] + new_tokens.extend(chunk) + new_tokens.append(eos_id) + eos_positions.append(len(new_tokens) - 1) + + all_input_ids.append(new_tokens) + all_eos_positions.append(eos_positions) + + # Padding + max_len = min(max(len(ids) for ids in all_input_ids), self.data_args.passage_max_len) + if self.data_args.pad_to_multiple_of: + max_len = ((max_len + self.data_args.pad_to_multiple_of - 1) + // self.data_args.pad_to_multiple_of * self.data_args.pad_to_multiple_of) + + padded_ids, padded_mask, final_eos_positions = [], [], [] + for input_ids, eos_pos in zip(all_input_ids, all_eos_positions): + if len(input_ids) > max_len: + input_ids = input_ids[:max_len] + eos_pos = [p for p in eos_pos if p < max_len] + pad_len = max_len - len(input_ids) + padded_ids.append(input_ids + [pad_id] * pad_len) + padded_mask.append([1] * len(input_ids) + [0] * pad_len) + final_eos_positions.append(eos_pos) + + d_collated = { + 'input_ids': torch.tensor(padded_ids, dtype=torch.long), + 'attention_mask': torch.tensor(padded_mask, dtype=torch.long), + } + return d_collated, final_eos_positions @dataclass diff --git a/src/tevatron/retriever/driver/train.py b/src/tevatron/retriever/driver/train.py index 39abab45..15b13adc 100644 --- a/src/tevatron/retriever/driver/train.py +++ b/src/tevatron/retriever/driver/train.py @@ -87,6 +87,7 @@ def main(): torch_dtype=torch_dtype, attn_implementation=model_args.attn_implementation, ) + model.passage_chunk_size = data_args.passage_chunk_size train_dataset = TrainDataset(data_args) collator = TrainCollator(data_args, tokenizer) diff --git a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py index 8bc50106..0cf64445 100644 --- a/src/tevatron/retriever/modeling/dense.py +++ b/src/tevatron/retriever/modeling/dense.py @@ -1,4 +1,5 @@ import torch +import torch.nn.functional as F import logging from transformers import Qwen2_5OmniThinkerForConditionalGeneration from .encoder import EncoderModel @@ -8,14 +9,44 @@ class DenseModel(EncoderModel): + def __init__(self, encoder, pooling='cls', normalize=False, temperature=1.0): + super().__init__(encoder, pooling, normalize, temperature) + self.passage_chunk_size = 0 + self._eos_positions = None + + def set_eos_positions(self, eos_positions): + self._eos_positions = eos_positions + def encode_query(self, qry): query_hidden_states = self.encoder(**qry, return_dict=True) query_hidden_states = query_hidden_states.last_hidden_state return self._pooling(query_hidden_states, qry['attention_mask']) def encode_passage(self, psg): - # encode passage is the same as encode query - return self.encode_query(psg) + hidden_states = self.encoder(**psg, 
return_dict=True).last_hidden_state + + if self.passage_chunk_size > 0 and self._eos_positions is not None: + return self._encode_chunked_passage(hidden_states) + return self._pooling(hidden_states, psg['attention_mask']) + + def _encode_chunked_passage(self, hidden_states): + """Extract EOS position embeddings as chunk representations.""" + batch_size, seq_len, hidden_size = hidden_states.shape + max_chunks = max(len(pos) for pos in self._eos_positions) + + chunk_embs = torch.zeros(batch_size, max_chunks, hidden_size, + device=hidden_states.device, dtype=hidden_states.dtype) + chunk_mask = torch.zeros(batch_size, max_chunks, device=hidden_states.device) + + for i, positions in enumerate(self._eos_positions): + for j, pos in enumerate(positions): + if pos < seq_len: + chunk_embs[i, j] = hidden_states[i, pos] + chunk_mask[i, j] = 1.0 + + if self.normalize: + chunk_embs = F.normalize(chunk_embs, p=2, dim=-1) + return chunk_embs, chunk_mask def _pooling(self, last_hidden_state, attention_mask): diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index c3eedc35..56536e18 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -38,6 +38,7 @@ def __init__(self, self.pooling = pooling self.normalize = normalize self.temperature = temperature + self.passage_chunk_size = 0 self.cross_entropy = nn.CrossEntropyLoss(reduction='mean') self.is_ddp = dist.is_initialized() if self.is_ddp: @@ -46,40 +47,50 @@ def __init__(self, def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = None): q_reps = self.encode_query(query) if query else None - p_reps = self.encode_passage(passage) if passage else None + + # Handle chunked vs normal passage encoding + if passage is not None: + p_result = self.encode_passage(passage) + if self.passage_chunk_size > 0 and isinstance(p_result, tuple): + p_reps, chunk_mask = p_result + else: + p_reps, chunk_mask = p_result, None + else: + p_reps, chunk_mask = None, None # for inference if q_reps is None or p_reps is None: - return EncoderOutput( - q_reps=q_reps, - p_reps=p_reps - ) + return EncoderOutput(q_reps=q_reps, p_reps=p_reps) # for training if self.training: if self.is_ddp: q_reps = self._dist_gather_tensor(q_reps) p_reps = self._dist_gather_tensor(p_reps) + if chunk_mask is not None: + chunk_mask = self._dist_gather_tensor(chunk_mask) - scores = self.compute_similarity(q_reps, p_reps) + if self.passage_chunk_size > 0 and chunk_mask is not None: + scores = self.compute_maxsim_similarity(q_reps, p_reps, chunk_mask) + else: + scores = self.compute_similarity(q_reps, p_reps) scores = scores.view(q_reps.size(0), -1) - target = torch.arange(scores.size(0), device=scores.device, dtype=torch.long) - target = target * (p_reps.size(0) // q_reps.size(0)) + num_psg_per_query = scores.size(1) // q_reps.size(0) + target = torch.arange(q_reps.size(0), device=scores.device, dtype=torch.long) + target = target * num_psg_per_query loss = self.compute_loss(scores / self.temperature, target) if self.is_ddp: - loss = loss * self.world_size # counter average weight reduction + loss = loss * self.world_size # for eval else: - scores = self.compute_similarity(q_reps, p_reps) + if self.passage_chunk_size > 0 and chunk_mask is not None: + scores = self.compute_maxsim_similarity(q_reps, p_reps, chunk_mask) + else: + scores = self.compute_similarity(q_reps, p_reps) loss = None - return EncoderOutput( - loss=loss, - scores=scores, - q_reps=q_reps, - p_reps=p_reps, - ) + 
return EncoderOutput(loss=loss, scores=scores, q_reps=q_reps, p_reps=p_reps) def encode_passage(self, psg): raise NotImplementedError('EncoderModel is an abstract class') @@ -90,6 +101,18 @@ def encode_query(self, qry): def compute_similarity(self, q_reps, p_reps): return torch.matmul(q_reps, p_reps.transpose(0, 1)) + def compute_maxsim_similarity(self, q_reps, p_reps, chunk_mask): + """ + MaxSim: max similarity between query and passage chunks. + q_reps: [Q, H], p_reps: [P, C, H], chunk_mask: [P, C] + Returns: [Q, P] + """ + chunk_scores = torch.einsum('qh,pch->qpc', q_reps, p_reps) + if chunk_mask is not None: + padding_mask = ~chunk_mask.unsqueeze(0).bool() + chunk_scores = chunk_scores.masked_fill(padding_mask, float('-inf')) + return chunk_scores.max(dim=-1).values + def compute_loss(self, scores, target): return self.cross_entropy(scores, target) diff --git a/src/tevatron/retriever/trainer.py b/src/tevatron/retriever/trainer.py index 0c6ceb58..30504361 100644 --- a/src/tevatron/retriever/trainer.py +++ b/src/tevatron/retriever/trainer.py @@ -45,7 +45,10 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): - query, passage = inputs + query, passage, *rest = inputs + eos_positions = rest[0] if rest else None + if hasattr(model, 'set_eos_positions'): + model.set_eos_positions(eos_positions) return model(query=query, passage=passage).loss def training_step(self, *args): From 1536517cb02e628a04a02998bfe92ccdaf558c02 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Wed, 10 Dec 2025 14:05:42 -0500 Subject: [PATCH 02/14] added logic for left padding --- src/tevatron/retriever/arguments.py | 4 ++-- src/tevatron/retriever/collator.py | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index cce3285f..f0e7ffa2 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -199,8 +199,8 @@ class DataArguments: ) padding_side: str = field( - default='right', - metadata={"help": "padding side for the tokenizer, can be 'left' or 'right'"} + default='left', + metadata={"help": "padding side for the tokenizer, can be 'left' or 'right'. Use 'left' for last-token pooling (decoder models like Qwen/LLaMA), 'right' for cls pooling (encoder models like BERT)"} ) passage_chunk_size: int = field( diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 6323fcc1..d2435a1d 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -82,10 +82,12 @@ def _tokenize_chunked_passages(self, passages: List[str]): """ Tokenize passages with EOS separators between chunks. Each chunk ends with EOS, enabling extraction of chunk embeddings from EOS positions. + Respects tokenizer.padding_side for consistent padding direction. 
""" chunk_size = self.data_args.passage_chunk_size eos_id = self.tokenizer.eos_token_id pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 + use_left_padding = self.tokenizer.padding_side == 'left' all_input_ids = [] all_eos_positions = [] @@ -115,10 +117,20 @@ def _tokenize_chunked_passages(self, passages: List[str]): if len(input_ids) > max_len: input_ids = input_ids[:max_len] eos_pos = [p for p in eos_pos if p < max_len] + pad_len = max_len - len(input_ids) - padded_ids.append(input_ids + [pad_id] * pad_len) - padded_mask.append([1] * len(input_ids) + [0] * pad_len) - final_eos_positions.append(eos_pos) + + if use_left_padding: + # Left padding: [PAD, PAD, ..., content] + padded_ids.append([pad_id] * pad_len + input_ids) + padded_mask.append([0] * pad_len + [1] * len(input_ids)) + # Adjust EOS positions: shift right by pad_len + final_eos_positions.append([p + pad_len for p in eos_pos]) + else: + # Right padding: [content, ..., PAD, PAD] + padded_ids.append(input_ids + [pad_id] * pad_len) + padded_mask.append([1] * len(input_ids) + [0] * pad_len) + final_eos_positions.append(eos_pos) d_collated = { 'input_ids': torch.tensor(padded_ids, dtype=torch.long), From dccfaf89874a313afabf6af28851996fa49fae33 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Wed, 10 Dec 2025 22:48:13 -0500 Subject: [PATCH 03/14] added search --- src/tevatron/retriever/collator.py | 88 +++++++++++++++++++++++- src/tevatron/retriever/driver/encode.py | 79 +++++++++++++++++---- src/tevatron/retriever/driver/search.py | 91 +++++++++++++++++++++++-- 3 files changed, 235 insertions(+), 23 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index d2435a1d..0eb37ac2 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -82,10 +82,15 @@ def _tokenize_chunked_passages(self, passages: List[str]): """ Tokenize passages with EOS separators between chunks. Each chunk ends with EOS, enabling extraction of chunk embeddings from EOS positions. - Respects tokenizer.padding_side for consistent padding direction. + + Uses the same token that tokenizer.add_special_tokens adds (e.g., <|endoftext|>) + so that query and passage use the same pooling token automatically. """ chunk_size = self.data_args.passage_chunk_size - eos_id = self.tokenizer.eos_token_id + # Get the token that tokenizer adds with add_special_tokens=True + # This ensures query and passage use the same token for pooling + sample = self.tokenizer.encode("x", add_special_tokens=True) + eos_id = sample[-1] # Last token added by tokenizer pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 use_left_padding = self.tokenizer.padding_side == 'left' @@ -289,6 +294,85 @@ def __call__(self, features): ) return content_ids, collated_inputs + +@dataclass +class ChunkedEncodeCollator: + """ + Collator for chunked passage encoding (inference/search). + Splits passages into chunks with EOS separators, similar to training. + """ + data_args: DataArguments + tokenizer: PreTrainedTokenizer + + def __call__(self, features): + """ + Collate function for chunked passage encoding. + :param features: list of (doc_id, text, image, video, audio) tuples + :return: (doc_ids, collated_inputs, eos_positions, chunk_counts) + + Uses the same token that tokenizer.add_special_tokens adds for consistency with query. 
+ """ + doc_ids = [x[0] for x in features] + texts = [x[1] for x in features] + + chunk_size = self.data_args.passage_chunk_size + # Get the token that tokenizer adds with add_special_tokens=True + sample = self.tokenizer.encode("x", add_special_tokens=True) + eos_id = sample[-1] # Last token added by tokenizer + pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 + use_left_padding = self.tokenizer.padding_side == 'left' + + all_input_ids = [] + all_eos_positions = [] + chunk_counts = [] + + for text in texts: + if text is None: + text = "" + tokens = self.tokenizer.encode(text, add_special_tokens=False) + + new_tokens = [] + eos_positions = [] + for i in range(0, max(len(tokens), 1), chunk_size): + chunk = tokens[i:i + chunk_size] + new_tokens.extend(chunk) + new_tokens.append(eos_id) + eos_positions.append(len(new_tokens) - 1) + + all_input_ids.append(new_tokens) + all_eos_positions.append(eos_positions) + chunk_counts.append(len(eos_positions)) + + # Padding + max_len = min(max(len(ids) for ids in all_input_ids), self.data_args.passage_max_len) + if self.data_args.pad_to_multiple_of: + max_len = ((max_len + self.data_args.pad_to_multiple_of - 1) // self.data_args.pad_to_multiple_of * self.data_args.pad_to_multiple_of) + + padded_ids, padded_mask, final_eos_positions = [], [], [] + for input_ids, eos_pos in zip(all_input_ids, all_eos_positions): + if len(input_ids) > max_len: + input_ids = input_ids[:max_len] + eos_pos = [p for p in eos_pos if p < max_len] + + pad_len = max_len - len(input_ids) + + if use_left_padding: + padded_ids.append([pad_id] * pad_len + input_ids) + padded_mask.append([0] * pad_len + [1] * len(input_ids)) + final_eos_positions.append([p + pad_len for p in eos_pos]) + else: + padded_ids.append(input_ids + [pad_id] * pad_len) + padded_mask.append([1] * len(input_ids) + [0] * pad_len) + final_eos_positions.append(eos_pos) + + collated_inputs = { + 'input_ids': torch.tensor(padded_ids, dtype=torch.long), + 'attention_mask': torch.tensor(padded_mask, dtype=torch.long), + } + + return doc_ids, collated_inputs, final_eos_positions, chunk_counts + + @dataclass class MultiModalEncodeCollator: """ diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py index 8749dfda..6908d789 100644 --- a/src/tevatron/retriever/driver/encode.py +++ b/src/tevatron/retriever/driver/encode.py @@ -8,6 +8,7 @@ from tqdm import tqdm import torch +import torch.nn.functional as F from torch.utils.data import DataLoader from transformers import AutoTokenizer @@ -18,7 +19,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments from tevatron.retriever.dataset import EncodeDataset -from tevatron.retriever.collator import EncodeCollator +from tevatron.retriever.collator import EncodeCollator, ChunkedEncodeCollator from tevatron.retriever.modeling import EncoderOutput, DenseModel logger = logging.getLogger(__name__) @@ -78,10 +79,23 @@ def main(): data_args=data_args, ) - encode_collator = EncodeCollator( - data_args=data_args, - tokenizer=tokenizer, + # Check if using chunked passage encoding + use_chunked = ( + not data_args.encode_is_query and + data_args.passage_chunk_size > 0 ) + + if use_chunked: + logger.info(f"Using chunked passage encoding with chunk_size={data_args.passage_chunk_size}") + encode_collator = ChunkedEncodeCollator( + data_args=data_args, + tokenizer=tokenizer, + ) + else: + encode_collator = EncodeCollator( + data_args=data_args, + 
tokenizer=tokenizer, + ) encode_loader = DataLoader( encode_dataset, @@ -96,23 +110,58 @@ def main(): model = model.to(training_args.device) model.eval() - for (batch_ids, batch) in tqdm(encode_loader): - lookup_indices.extend(batch_ids) + for batch in tqdm(encode_loader): with torch.amp.autocast('cuda') if training_args.fp16 or training_args.bf16 else nullcontext(): with torch.no_grad(): - for k, v in batch.items(): - batch[k] = v.to(training_args.device) - if data_args.encode_is_query: - model_output: EncoderOutput = model(query=batch) - encoded.append(model_output.q_reps.cpu().detach().numpy()) + if use_chunked: + # Chunked passage encoding + doc_ids, batch_inputs, eos_positions, chunk_counts = batch + + for k, v in batch_inputs.items(): + batch_inputs[k] = v.to(training_args.device) + + # Get hidden states from encoder + hidden_states = model.encoder(**batch_inputs, return_dict=True).last_hidden_state + # hidden_states: [batch_size, seq_len, hidden_size] + + # Extract embeddings at EOS positions + for i, (doc_id, positions) in enumerate(zip(doc_ids, eos_positions)): + for chunk_idx, pos in enumerate(positions): + if pos < hidden_states.shape[1]: + chunk_emb = hidden_states[i, pos] + + # Normalize if needed + if model.normalize: + chunk_emb = F.normalize(chunk_emb, p=2, dim=-1) + + encoded.append(chunk_emb.cpu().numpy()) + lookup_indices.append((doc_id, chunk_idx)) else: - model_output: EncoderOutput = model(passage=batch) - encoded.append(model_output.p_reps.cpu().detach().numpy()) - - encoded = np.concatenate(encoded) + # Standard query or passage encoding + batch_ids, batch_inputs = batch + lookup_indices.extend(batch_ids) + + for k, v in batch_inputs.items(): + batch_inputs[k] = v.to(training_args.device) + + if data_args.encode_is_query: + model_output: EncoderOutput = model(query=batch_inputs) + encoded.append(model_output.q_reps.cpu().detach().numpy()) + else: + model_output: EncoderOutput = model(passage=batch_inputs) + encoded.append(model_output.p_reps.cpu().detach().numpy()) + + # Combine encoded embeddings + if use_chunked: + encoded = np.stack(encoded) + logger.info(f"Encoded {len(set(d for d, c in lookup_indices))} docs into {len(lookup_indices)} chunks") + else: + encoded = np.concatenate(encoded) with open(data_args.encode_output_path, 'wb') as f: pickle.dump((encoded, lookup_indices), f) + + logger.info(f"Saved embeddings to {data_args.encode_output_path}, shape: {encoded.shape}") if __name__ == "__main__": diff --git a/src/tevatron/retriever/driver/search.py b/src/tevatron/retriever/driver/search.py index 1f374eac..5a7f84e8 100644 --- a/src/tevatron/retriever/driver/search.py +++ b/src/tevatron/retriever/driver/search.py @@ -3,6 +3,7 @@ import numpy as np import glob from argparse import ArgumentParser +from collections import defaultdict from itertools import chain from tqdm import tqdm import faiss @@ -29,6 +30,41 @@ def search_queries(retriever, q_reps, p_lookup, args): return all_scores, psg_indices +def search_queries_chunked(retriever, q_reps, p_lookup, args): + """ + Search with chunked passages and aggregate by document using MaxSim. 
+ """ + # Search more chunks to ensure good recall after aggregation + search_depth = args.depth * args.chunk_multiplier + + if args.batch_size > 0: + all_scores, all_indices = retriever.batch_search(q_reps, search_depth, args.batch_size, args.quiet) + else: + all_scores, all_indices = retriever.search(q_reps, search_depth) + + # Aggregate by document ID using MaxSim + aggregated_results = [] + for q_idx in range(len(q_reps)): + scores = all_scores[q_idx] + indices = all_indices[q_idx] + + doc_max_scores = defaultdict(lambda: float('-inf')) + + for score, idx in zip(scores, indices): + if idx < 0: # FAISS returns -1 for insufficient results + continue + + doc_id, chunk_idx = p_lookup[idx] + # MaxSim: keep the maximum score for each document + doc_max_scores[doc_id] = max(doc_max_scores[doc_id], score) + + # Sort by score and take top-depth + sorted_docs = sorted(doc_max_scores.items(), key=lambda x: x[1], reverse=True)[:args.depth] + aggregated_results.append(sorted_docs) + + return aggregated_results + + def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file): with open(ranking_save_file, 'w') as f: for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices): @@ -38,6 +74,17 @@ def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file): f.write(f'{qid}\t{idx}\t{s}\n') +def write_ranking_chunked(results, q_lookup, ranking_save_file): + """ + Write ranking results from chunked search. + results: List[List[Tuple[doc_id, score]]] + """ + with open(ranking_save_file, 'w') as f: + for qid, doc_scores in zip(q_lookup, results): + for doc_id, score in doc_scores: + f.write(f'{qid}\t{doc_id}\t{score}\n') + + def pickle_load(path): with open(path, 'rb') as f: reps, lookup = pickle.load(f) @@ -58,6 +105,11 @@ def main(): parser.add_argument('--save_ranking_to', required=True) parser.add_argument('--save_text', action='store_true') parser.add_argument('--quiet', action='store_true') + # Chunked search arguments + parser.add_argument('--chunked', action='store_true', + help='Enable chunked search with document-level MaxSim aggregation') + parser.add_argument('--chunk_multiplier', type=int, default=10, + help='Multiply search depth by this factor for chunked search to ensure recall') args = parser.parse_args() @@ -75,6 +127,14 @@ def main(): retriever.add(p_reps) look_up += p_lookup + # Auto-detect chunked format: lookup entries are tuples (doc_id, chunk_idx) + is_chunked = args.chunked or (len(look_up) > 0 and isinstance(look_up[0], tuple)) + + if is_chunked: + unique_docs = len(set(doc_id for doc_id, _ in look_up)) + logger.info(f"Chunked mode: {len(look_up)} chunks from {unique_docs} documents") + logger.info(f"Search depth: {args.depth} docs, chunk search depth: {args.depth * args.chunk_multiplier}") + q_reps, q_lookup = pickle_load(args.query_reps) q_reps = q_reps @@ -96,14 +156,33 @@ def main(): ngpu=num_gpus) logger.info('Index Search Start') - all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args) - logger.info('Index Search Finished') - - if args.save_text: - write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to) + + if is_chunked: + # Chunked search with MaxSim aggregation + aggregated_results = search_queries_chunked(retriever, q_reps, look_up, args) + logger.info('Index Search Finished (chunked mode with MaxSim aggregation)') + + if args.save_text: + write_ranking_chunked(aggregated_results, q_lookup, args.save_ranking_to) + else: + # Convert to arrays for pickle + all_scores = [] + 
all_doc_ids = [] + for doc_scores in aggregated_results: + scores = [s for _, s in doc_scores] + doc_ids = [d for d, _ in doc_scores] + all_scores.append(scores) + all_doc_ids.append(doc_ids) + pickle_save((all_scores, all_doc_ids), args.save_ranking_to) else: - pickle_save((all_scores, psg_indices), args.save_ranking_to) + # Standard search + all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args) + logger.info('Index Search Finished') + if args.save_text: + write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to) + else: + pickle_save((all_scores, psg_indices), args.save_ranking_to) if __name__ == '__main__': main() From 1769715e6e2e2f6187c406091ae80cadc558f9f2 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 00:10:16 -0500 Subject: [PATCH 04/14] changed the tokenizer logic --- src/tevatron/retriever/collator.py | 72 +++++++++++------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 0eb37ac2..58ada992 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -86,61 +86,39 @@ def _tokenize_chunked_passages(self, passages: List[str]): Uses the same token that tokenizer.add_special_tokens adds (e.g., <|endoftext|>) so that query and passage use the same pooling token automatically. """ - chunk_size = self.data_args.passage_chunk_size - # Get the token that tokenizer adds with add_special_tokens=True - # This ensures query and passage use the same token for pooling - sample = self.tokenizer.encode("x", add_special_tokens=True) - eos_id = sample[-1] # Last token added by tokenizer - pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 - use_left_padding = self.tokenizer.padding_side == 'left' + chunk_length = self.data_args.passage_chunk_size + sep_id = 151645 # <|separator|> + eos_id = 151643 # <|endoftext|> all_input_ids = [] - all_eos_positions = [] + all_sep_positions = [] for passage in passages: - tokens = self.tokenizer.encode(passage, add_special_tokens=False) - + tokens = self.tokenizer.encode(passage, add_special_tokens=False) # There maybe some differences between models, this is for qwen3-embedding-0.6b, it only adds <|separator|> and endoftext new_tokens = [] - eos_positions = [] - for i in range(0, max(len(tokens), 1), chunk_size): - chunk = tokens[i:i + chunk_size] - new_tokens.extend(chunk) - new_tokens.append(eos_id) - eos_positions.append(len(new_tokens) - 1) - + sep_positions = [] + i = 1 + while i < len(tokens): + if i % chunk_length == 0: + new_tokens.append(sep_id) + sep_positions.append(i-1) + else: + new_tokens.append(tokens[i-1]) + i += 1 + new_tokens.append(eos_id) # edge case, what if the new_tokens[-1] is sep_id? 
+ new_tokens.append(sep_id) + sep_positions.append(len(new_tokens)-1) all_input_ids.append(new_tokens) - all_eos_positions.append(eos_positions) + all_sep_positions.append(sep_positions) # Padding - max_len = min(max(len(ids) for ids in all_input_ids), self.data_args.passage_max_len) - if self.data_args.pad_to_multiple_of: - max_len = ((max_len + self.data_args.pad_to_multiple_of - 1) - // self.data_args.pad_to_multiple_of * self.data_args.pad_to_multiple_of) - - padded_ids, padded_mask, final_eos_positions = [], [], [] - for input_ids, eos_pos in zip(all_input_ids, all_eos_positions): - if len(input_ids) > max_len: - input_ids = input_ids[:max_len] - eos_pos = [p for p in eos_pos if p < max_len] - - pad_len = max_len - len(input_ids) - - if use_left_padding: - # Left padding: [PAD, PAD, ..., content] - padded_ids.append([pad_id] * pad_len + input_ids) - padded_mask.append([0] * pad_len + [1] * len(input_ids)) - # Adjust EOS positions: shift right by pad_len - final_eos_positions.append([p + pad_len for p in eos_pos]) - else: - # Right padding: [content, ..., PAD, PAD] - padded_ids.append(input_ids + [pad_id] * pad_len) - padded_mask.append([1] * len(input_ids) + [0] * pad_len) - final_eos_positions.append(eos_pos) - - d_collated = { - 'input_ids': torch.tensor(padded_ids, dtype=torch.long), - 'attention_mask': torch.tensor(padded_mask, dtype=torch.long), - } + d_collated = self.tokenizer.pad( + d_collated, + padding=True, + pad_to_multiple_of=self.data_args.pad_to_multiple_of, + return_attention_mask=True, + return_tensors='pt', + ) return d_collated, final_eos_positions From 16d604a21b3e89ec21f43b22dd074dcce863a52c Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 01:23:06 -0500 Subject: [PATCH 05/14] added train collator debug --- src/tevatron/retriever/collator.py | 35 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 58ada992..6129c1c1 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -86,7 +86,7 @@ def _tokenize_chunked_passages(self, passages: List[str]): Uses the same token that tokenizer.add_special_tokens adds (e.g., <|endoftext|>) so that query and passage use the same pooling token automatically. """ - chunk_length = self.data_args.passage_chunk_size + chunk_len = self.data_args.passage_chunk_size -1 sep_id = 151645 # <|separator|> eos_id = 151643 # <|endoftext|> @@ -94,23 +94,20 @@ def _tokenize_chunked_passages(self, passages: List[str]): all_sep_positions = [] for passage in passages: - tokens = self.tokenizer.encode(passage, add_special_tokens=False) # There maybe some differences between models, this is for qwen3-embedding-0.6b, it only adds <|separator|> and endoftext - new_tokens = [] - sep_positions = [] - i = 1 - while i < len(tokens): - if i % chunk_length == 0: - new_tokens.append(sep_id) - sep_positions.append(i-1) - else: - new_tokens.append(tokens[i-1]) - i += 1 - new_tokens.append(eos_id) # edge case, what if the new_tokens[-1] is sep_id? 
- new_tokens.append(sep_id) - sep_positions.append(len(new_tokens)-1) - all_input_ids.append(new_tokens) - all_sep_positions.append(sep_positions) + tokens = self.tokenizer.encode(passage, add_special_tokens=False) + tokens.append(eos_id) + ids = [] + sep_pos = [] + for i in range(0, len(tokens), chunk_len): + chunk = tokens[i:i + chunk_len] # up to 128 tokens + ids.extend(chunk) + ids.append(sep_id) # SEP at end of this chunk + sep_pos.append(len(ids) - 1) # position of SEP + + all_input_ids.append(ids) + all_sep_positions.append(sep_pos) + d_collated = {'input_ids': all_input_ids} # Padding d_collated = self.tokenizer.pad( d_collated, @@ -119,7 +116,9 @@ def _tokenize_chunked_passages(self, passages: List[str]): return_attention_mask=True, return_tensors='pt', ) - return d_collated, final_eos_positions + print(d_collated['input_ids'][0]) + print(all_sep_positions[0]) + return d_collated, all_sep_positions @dataclass From 61dbf6eb3fb2f2ac1b73d714eb11d5f72100f689 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 01:40:12 -0500 Subject: [PATCH 06/14] traincollator is done --- src/tevatron/retriever/collator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 6129c1c1..a9082379 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -116,8 +116,6 @@ def _tokenize_chunked_passages(self, passages: List[str]): return_attention_mask=True, return_tensors='pt', ) - print(d_collated['input_ids'][0]) - print(all_sep_positions[0]) return d_collated, all_sep_positions From 6ec22a9ff46694c7fd3f23a56526b80266d18863 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 02:06:33 -0500 Subject: [PATCH 07/14] fixed some comments --- src/tevatron/retriever/collator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index a9082379..cb322603 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -55,7 +55,7 @@ def __call__(self, features: List[Tuple[str, List[str]]]): # Passage tokenization if self.data_args.passage_chunk_size > 0: - d_collated, eos_positions = self._tokenize_chunked_passages(all_passages) + d_collated, eos_positions = self._tokenize_and_pad_chunked_passages(all_passages) return q_collated, d_collated, eos_positions else: d_collated = self.tokenizer( @@ -78,7 +78,7 @@ def __call__(self, features: List[Tuple[str, List[str]]]): ) return q_collated, d_collated - def _tokenize_chunked_passages(self, passages: List[str]): + def _tokenize_and_pad_chunked_passages(self, passages: List[str]): """ Tokenize passages with EOS separators between chunks. Each chunk ends with EOS, enabling extraction of chunk embeddings from EOS positions. 
@@ -99,7 +99,7 @@ def _tokenize_chunked_passages(self, passages: List[str]): ids = [] sep_pos = [] for i in range(0, len(tokens), chunk_len): - chunk = tokens[i:i + chunk_len] # up to 128 tokens + chunk = tokens[i:i + chunk_len] # up to self.data_args.passage_chunk_size -1 tokens ids.extend(chunk) ids.append(sep_id) # SEP at end of this chunk sep_pos.append(len(ids) - 1) # position of SEP From 40eddf8d81710b427379e5976ba711658f056b6a Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 14:57:25 -0500 Subject: [PATCH 08/14] modified chunkedencoder --- src/tevatron/retriever/collator.py | 74 +++++++++---------------- src/tevatron/retriever/driver/encode.py | 16 +----- 2 files changed, 30 insertions(+), 60 deletions(-) diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index cb322603..e489fa29 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -275,6 +275,7 @@ class ChunkedEncodeCollator: """ Collator for chunked passage encoding (inference/search). Splits passages into chunks with EOS separators, similar to training. + Uses the same chunking logic as TrainCollator._tokenize_and_pad_chunked_passages. """ data_args: DataArguments tokenizer: PreTrainedTokenizer @@ -283,69 +284,48 @@ def __call__(self, features): """ Collate function for chunked passage encoding. :param features: list of (doc_id, text, image, video, audio) tuples - :return: (doc_ids, collated_inputs, eos_positions, chunk_counts) - - Uses the same token that tokenizer.add_special_tokens adds for consistency with query. + :return: (doc_ids, collated_inputs, sep_positions, chunk_counts) """ doc_ids = [x[0] for x in features] texts = [x[1] for x in features] - chunk_size = self.data_args.passage_chunk_size - # Get the token that tokenizer adds with add_special_tokens=True - sample = self.tokenizer.encode("x", add_special_tokens=True) - eos_id = sample[-1] # Last token added by tokenizer - pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0 - use_left_padding = self.tokenizer.padding_side == 'left' + chunk_len = self.data_args.passage_chunk_size - 1 + sep_id = 151645 # <|separator|> + eos_id = 151643 # <|endoftext|> all_input_ids = [] - all_eos_positions = [] + all_sep_positions = [] chunk_counts = [] for text in texts: if text is None: text = "" tokens = self.tokenizer.encode(text, add_special_tokens=False) + tokens.append(eos_id) - new_tokens = [] - eos_positions = [] - for i in range(0, max(len(tokens), 1), chunk_size): - chunk = tokens[i:i + chunk_size] - new_tokens.extend(chunk) - new_tokens.append(eos_id) - eos_positions.append(len(new_tokens) - 1) - - all_input_ids.append(new_tokens) - all_eos_positions.append(eos_positions) - chunk_counts.append(len(eos_positions)) - - # Padding - max_len = min(max(len(ids) for ids in all_input_ids), self.data_args.passage_max_len) - if self.data_args.pad_to_multiple_of: - max_len = ((max_len + self.data_args.pad_to_multiple_of - 1) // self.data_args.pad_to_multiple_of * self.data_args.pad_to_multiple_of) - - padded_ids, padded_mask, final_eos_positions = [], [], [] - for input_ids, eos_pos in zip(all_input_ids, all_eos_positions): - if len(input_ids) > max_len: - input_ids = input_ids[:max_len] - eos_pos = [p for p in eos_pos if p < max_len] - - pad_len = max_len - len(input_ids) + ids = [] + sep_pos = [] + for i in range(0, len(tokens), chunk_len): + chunk = tokens[i:i + chunk_len] # up to passage_chunk_size - 1 tokens + ids.extend(chunk) + ids.append(sep_id) # SEP at end 
of this chunk + sep_pos.append(len(ids) - 1) # position of SEP - if use_left_padding: - padded_ids.append([pad_id] * pad_len + input_ids) - padded_mask.append([0] * pad_len + [1] * len(input_ids)) - final_eos_positions.append([p + pad_len for p in eos_pos]) - else: - padded_ids.append(input_ids + [pad_id] * pad_len) - padded_mask.append([1] * len(input_ids) + [0] * pad_len) - final_eos_positions.append(eos_pos) + all_input_ids.append(ids) + all_sep_positions.append(sep_pos) + chunk_counts.append(len(sep_pos)) - collated_inputs = { - 'input_ids': torch.tensor(padded_ids, dtype=torch.long), - 'attention_mask': torch.tensor(padded_mask, dtype=torch.long), - } + # Use tokenizer.pad() for consistent padding + d_collated = {'input_ids': all_input_ids} + d_collated = self.tokenizer.pad( + d_collated, + padding=True, + pad_to_multiple_of=self.data_args.pad_to_multiple_of, + return_attention_mask=True, + return_tensors='pt', + ) - return doc_ids, collated_inputs, final_eos_positions, chunk_counts + return doc_ids, d_collated, all_sep_positions, chunk_counts @dataclass diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py index 6908d789..b0c4d862 100644 --- a/src/tevatron/retriever/driver/encode.py +++ b/src/tevatron/retriever/driver/encode.py @@ -79,23 +79,13 @@ def main(): data_args=data_args, ) - # Check if using chunked passage encoding - use_chunked = ( - not data_args.encode_is_query and - data_args.passage_chunk_size > 0 - ) + use_chunked = not data_args.encode_is_query and data_args.passage_chunk_size > 0 if use_chunked: logger.info(f"Using chunked passage encoding with chunk_size={data_args.passage_chunk_size}") - encode_collator = ChunkedEncodeCollator( - data_args=data_args, - tokenizer=tokenizer, - ) + encode_collator = ChunkedEncodeCollator(data_args=data_args, tokenizer=tokenizer) else: - encode_collator = EncodeCollator( - data_args=data_args, - tokenizer=tokenizer, - ) + encode_collator = EncodeCollator(data_args=data_args, tokenizer=tokenizer) encode_loader = DataLoader( encode_dataset, From 0ebcf37959f17345a5e41e12945ca10466a2549e Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Sun, 14 Dec 2025 16:25:54 -0500 Subject: [PATCH 09/14] Modified forward and model --- src/tevatron/retriever/driver/encode.py | 27 +++++--------- src/tevatron/retriever/modeling/dense.py | 41 ++++++++++++---------- src/tevatron/retriever/modeling/encoder.py | 37 ++++++++++--------- 3 files changed, 53 insertions(+), 52 deletions(-) diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py index b0c4d862..2583496d 100644 --- a/src/tevatron/retriever/driver/encode.py +++ b/src/tevatron/retriever/driver/encode.py @@ -104,30 +104,21 @@ def main(): with torch.amp.autocast('cuda') if training_args.fp16 or training_args.bf16 else nullcontext(): with torch.no_grad(): if use_chunked: - # Chunked passage encoding - doc_ids, batch_inputs, eos_positions, chunk_counts = batch - + doc_ids, batch_inputs, sep_positions, chunk_counts = batch for k, v in batch_inputs.items(): batch_inputs[k] = v.to(training_args.device) - # Get hidden states from encoder - hidden_states = model.encoder(**batch_inputs, return_dict=True).last_hidden_state - # hidden_states: [batch_size, seq_len, hidden_size] + # Use DenseModel's encode_passage to extract chunk embeddings + chunk_embs, chunk_mask = model.encode_passage(batch_inputs, sep_positions=sep_positions) - # Extract embeddings at EOS positions - for i, (doc_id, positions) in enumerate(zip(doc_ids, eos_positions)): 
- for chunk_idx, pos in enumerate(positions): - if pos < hidden_states.shape[1]: - chunk_emb = hidden_states[i, pos] - - # Normalize if needed - if model.normalize: - chunk_emb = F.normalize(chunk_emb, p=2, dim=-1) - - encoded.append(chunk_emb.cpu().numpy()) + # Flatten chunk embeddings and create lookup indices + batch_size, max_chunks, hidden_size = chunk_embs.shape + for i, doc_id in enumerate(doc_ids): + for chunk_idx in range(max_chunks): + if chunk_mask[i, chunk_idx] > 0: # Valid chunk + encoded.append(chunk_embs[i, chunk_idx].cpu().detach().numpy()) lookup_indices.append((doc_id, chunk_idx)) else: - # Standard query or passage encoding batch_ids, batch_inputs = batch lookup_indices.extend(batch_ids) diff --git a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py index 0cf64445..fe817455 100644 --- a/src/tevatron/retriever/modeling/dense.py +++ b/src/tevatron/retriever/modeling/dense.py @@ -12,40 +12,45 @@ class DenseModel(EncoderModel): def __init__(self, encoder, pooling='cls', normalize=False, temperature=1.0): super().__init__(encoder, pooling, normalize, temperature) self.passage_chunk_size = 0 - self._eos_positions = None - - def set_eos_positions(self, eos_positions): - self._eos_positions = eos_positions + self.sep_positions = None def encode_query(self, qry): query_hidden_states = self.encoder(**qry, return_dict=True) query_hidden_states = query_hidden_states.last_hidden_state return self._pooling(query_hidden_states, qry['attention_mask']) - def encode_passage(self, psg): + def encode_passage(self, psg, sep_positions=None): hidden_states = self.encoder(**psg, return_dict=True).last_hidden_state - - if self.passage_chunk_size > 0 and self._eos_positions is not None: - return self._encode_chunked_passage(hidden_states) + if self.passage_chunk_size > 0: + return self._pooling_chunked(hidden_states, self.sep_positions) return self._pooling(hidden_states, psg['attention_mask']) - def _encode_chunked_passage(self, hidden_states): - """Extract EOS position embeddings as chunk representations.""" - batch_size, seq_len, hidden_size = hidden_states.shape - max_chunks = max(len(pos) for pos in self._eos_positions) + def _pooling_chunked(self, last_hidden_state, sep_positions): + batch_size, seq_len, hidden_size = last_hidden_state.shape - chunk_embs = torch.zeros(batch_size, max_chunks, hidden_size, - device=hidden_states.device, dtype=hidden_states.dtype) - chunk_mask = torch.zeros(batch_size, max_chunks, device=hidden_states.device) + if not sep_positions: + # No chunks, return empty + return torch.zeros(batch_size, 0, hidden_size, device=last_hidden_state.device, dtype=last_hidden_state.dtype), \ + torch.zeros(batch_size, 0, device=last_hidden_state.device) - for i, positions in enumerate(self._eos_positions): + # Find max number of chunks across all passages + max_chunks = max(len(pos_list) for pos_list in sep_positions) + + chunk_embs = torch.zeros(batch_size, max_chunks, hidden_size, device=last_hidden_state.device, dtype=last_hidden_state.dtype) + chunk_mask = torch.zeros(batch_size, max_chunks, device=last_hidden_state.device, dtype=torch.float) + + # Extract embeddings at sep_positions (this is the pooling operation for chunked passages) + for i, positions in enumerate(sep_positions): for j, pos in enumerate(positions): - if pos < seq_len: - chunk_embs[i, j] = hidden_states[i, pos] + if 0 <= pos < seq_len: + chunk_embs[i, j] = last_hidden_state[i, pos] chunk_mask[i, j] = 1.0 + else: + logger.warning(f"Position {pos} out of bounds for sequence 
length {seq_len} in batch {i}, chunk {j}") if self.normalize: chunk_embs = F.normalize(chunk_embs, p=2, dim=-1) + return chunk_embs, chunk_mask diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index 56536e18..1ccd678d 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -47,28 +47,24 @@ def __init__(self, def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = None): q_reps = self.encode_query(query) if query else None - - # Handle chunked vs normal passage encoding - if passage is not None: - p_result = self.encode_passage(passage) - if self.passage_chunk_size > 0 and isinstance(p_result, tuple): - p_reps, chunk_mask = p_result - else: - p_reps, chunk_mask = p_result, None - else: - p_reps, chunk_mask = None, None + p_reps, chunk_mask = None, None + if passage: + p_reps = self.encode_passage(passage) + if self.passage_chunk_size > 0 and isinstance(p_reps, tuple): + p_reps, chunk_mask = p_reps # for inference if q_reps is None or p_reps is None: - return EncoderOutput(q_reps=q_reps, p_reps=p_reps) + return EncoderOutput( + q_reps=q_reps, + p_reps=p_reps + ) # for training if self.training: if self.is_ddp: q_reps = self._dist_gather_tensor(q_reps) p_reps = self._dist_gather_tensor(p_reps) - if chunk_mask is not None: - chunk_mask = self._dist_gather_tensor(chunk_mask) if self.passage_chunk_size > 0 and chunk_mask is not None: scores = self.compute_maxsim_similarity(q_reps, p_reps, chunk_mask) @@ -82,7 +78,7 @@ def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = loss = self.compute_loss(scores / self.temperature, target) if self.is_ddp: - loss = loss * self.world_size + loss = loss * self.world_size # counter average weight reduction # for eval else: if self.passage_chunk_size > 0 and chunk_mask is not None: @@ -90,7 +86,12 @@ def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = else: scores = self.compute_similarity(q_reps, p_reps) loss = None - return EncoderOutput(loss=loss, scores=scores, q_reps=q_reps, p_reps=p_reps) + return EncoderOutput( + loss=loss, + scores=scores, + q_reps=q_reps, + p_reps=p_reps, + ) def encode_passage(self, psg): raise NotImplementedError('EncoderModel is an abstract class') @@ -105,9 +106,13 @@ def compute_maxsim_similarity(self, q_reps, p_reps, chunk_mask): """ MaxSim: max similarity between query and passage chunks. 
q_reps: [Q, H], p_reps: [P, C, H], chunk_mask: [P, C]
+        Q: number of queries
+        P: number of passages
+        C: number of chunks per passage
+        H: dimension of the embeddings
         Returns: [Q, P]
         """
-        chunk_scores = torch.einsum('qh,pch->qpc', q_reps, p_reps)
+        chunk_scores = torch.einsum('qh,pch->qpc', q_reps, p_reps) # similarity between the q-th query and the c-th chunk of the p-th passage
         if chunk_mask is not None:
             padding_mask = ~chunk_mask.unsqueeze(0).bool()
             chunk_scores = chunk_scores.masked_fill(padding_mask, float('-inf'))
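Note: the MaxSim scoring that this patch adds to compute_maxsim_similarity can be checked in isolation. Below is a minimal standalone sketch; the tensor shapes follow the docstring above ([Q, H], [P, C, H], [P, C]), while the sizes and random values are illustrative assumptions, not part of the patch series.

# Standalone sketch of the MaxSim scoring path (toy sizes: Q=2, P=3, C=4, H=8).
import torch
import torch.nn.functional as F

q_reps = F.normalize(torch.randn(2, 8), dim=-1)        # [Q, H] query embeddings
p_reps = F.normalize(torch.randn(3, 4, 8), dim=-1)     # [P, C, H] chunk embeddings
chunk_mask = torch.tensor([[1., 1., 1., 0.],           # passage 0: 3 valid chunks
                           [1., 1., 0., 0.],           # passage 1: 2 valid chunks
                           [1., 1., 1., 1.]])          # passage 2: 4 valid chunks

chunk_scores = torch.einsum('qh,pch->qpc', q_reps, p_reps)            # [Q, P, C]
padding_mask = ~chunk_mask.unsqueeze(0).bool()                        # [1, P, C], True where padded
chunk_scores = chunk_scores.masked_fill(padding_mask, float('-inf'))  # padded chunks never win the max
scores = chunk_scores.max(dim=-1).values                              # [Q, P] document-level scores
assert scores.shape == (2, 3) and torch.isfinite(scores).all()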
From e7e3bc368700240e26675f294ea3e241263d4c4f Mon Sep 17 00:00:00 2001
From: Ryan Yu
Date: Sun, 14 Dec 2025 18:27:07 -0500
Subject: [PATCH 10/14] Modified inference on chunked passage, in progress

---
 src/tevatron/retriever/arguments.py      |  4 ++--
 src/tevatron/retriever/driver/encode.py  |  7 ++++---
 src/tevatron/retriever/modeling/dense.py | 12 ++++++------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py
index f0e7ffa2..cce3285f 100644
--- a/src/tevatron/retriever/arguments.py
+++ b/src/tevatron/retriever/arguments.py
@@ -199,8 +199,8 @@ class DataArguments:
     )
 
     padding_side: str = field(
-        default='left',
-        metadata={"help": "padding side for the tokenizer, can be 'left' or 'right'. Use 'left' for last-token pooling (decoder models like Qwen/LLaMA), 'right' for cls pooling (encoder models like BERT)"}
+        default='right',
+        metadata={"help": "padding side for the tokenizer, can be 'left' or 'right'"}
     )
 
     passage_chunk_size: int = field(
diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py
index 2583496d..55044691 100644
--- a/src/tevatron/retriever/driver/encode.py
+++ b/src/tevatron/retriever/driver/encode.py
@@ -10,6 +10,8 @@
 import torch
 import torch.nn.functional as F
 
+from rich import print
+
 from torch.utils.data import DataLoader
 from transformers import AutoTokenizer
 from transformers import (
@@ -105,11 +107,10 @@ def main():
             with torch.no_grad():
                 if use_chunked:
                     doc_ids, batch_inputs, sep_positions, chunk_counts = batch
+                    print(batch_inputs)
                     for k, v in batch_inputs.items():
                         batch_inputs[k] = v.to(training_args.device)
-
-                    # Use DenseModel's encode_passage to extract chunk embeddings
-                    chunk_embs, chunk_mask = model.encode_passage(batch_inputs, sep_positions=sep_positions)
+                    chunk_embs, chunk_mask = model.encode_passage(batch_inputs, sep_positions)
diff --git a/src/tevatron/retriever/modeling/dense.py b/src/tevatron/retriever/modeling/dense.py
index fe817455..43fec213 100644
--- a/src/tevatron/retriever/modeling/dense.py
+++ b/src/tevatron/retriever/modeling/dense.py
@@ -21,8 +21,8 @@ def encode_query(self, qry):
     def encode_passage(self, psg, sep_positions=None):
         hidden_states = self.encoder(**psg, return_dict=True).last_hidden_state
-        if self.passage_chunk_size > 0:
-            return self._pooling_chunked(hidden_states, self.sep_positions)
+        if self.passage_chunk_size > 0 and sep_positions:
+            return self._pooling_chunked(hidden_states, sep_positions)
         return self._pooling(hidden_states, psg['attention_mask'])
 
     def _pooling_chunked(self, last_hidden_state, sep_positions):
@@ -36,22 +36,22 @@ def _pooling_chunked(self, last_hidden_state, sep_positions):
         # Find max number of chunks across all passages
         max_chunks = max(len(pos_list) for pos_list in sep_positions)
 
-        chunk_embs = torch.zeros(batch_size, max_chunks, hidden_size, device=last_hidden_state.device, dtype=last_hidden_state.dtype)
+        chunk_reps = torch.zeros(batch_size, max_chunks, hidden_size, device=last_hidden_state.device, dtype=last_hidden_state.dtype)
         chunk_mask = torch.zeros(batch_size, max_chunks, device=last_hidden_state.device, dtype=torch.float)
 
         # Extract embeddings at sep_positions (this is the pooling operation for chunked passages)
         for i, positions in enumerate(sep_positions):
             for j, pos in enumerate(positions):
                 if 0 <= pos < seq_len:
-                    chunk_embs[i, j] = last_hidden_state[i, pos]
+                    chunk_reps[i, j] = last_hidden_state[i, pos]
                     chunk_mask[i, j] = 1.0
                 else:
                     logger.warning(f"Position {pos} out of bounds for sequence length {seq_len} in batch {i}, chunk {j}")
 
         if self.normalize:
-            chunk_embs = F.normalize(chunk_embs, p=2, dim=-1)
+            chunk_reps = F.normalize(chunk_reps, p=2, dim=-1)
 
-        return chunk_embs, chunk_mask
+        return chunk_reps, chunk_mask
 
     def _pooling(self, last_hidden_state, attention_mask):

From 2d9939f2c6bebc01a1a5bb855bb54a2e02a7d632 Mon Sep 17 00:00:00 2001
From: Ryan Yu
Date: Sun, 14 Dec 2025 18:53:02 -0500
Subject: [PATCH 11/14] fixed a chunk size not passed to model

---
 src/tevatron/retriever/driver/encode.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py
index 55044691..24cfc514 100644
--- a/src/tevatron/retriever/driver/encode.py
+++ b/src/tevatron/retriever/driver/encode.py
@@ -85,6 +85,7 @@ def main():
 
     if use_chunked:
         logger.info(f"Using chunked passage encoding with chunk_size={data_args.passage_chunk_size}")
+        model.passage_chunk_size = data_args.passage_chunk_size
        encode_collator = ChunkedEncodeCollator(data_args=data_args, tokenizer=tokenizer)
     else:
         encode_collator = EncodeCollator(data_args=data_args, tokenizer=tokenizer)

From ed3e302c2d49a97a5c1c751884fad514a09ef4c7 Mon Sep 17 00:00:00 2001
From: Ryan Yu
Date: Mon, 15 Dec 2025 14:29:40 -0500
Subject: [PATCH 12/14] changed eos to sep

---
 src/tevatron/retriever/trainer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tevatron/retriever/trainer.py b/src/tevatron/retriever/trainer.py
index 30504361..e759c42c 100644
--- a/src/tevatron/retriever/trainer.py
+++ b/src/tevatron/retriever/trainer.py
@@ -46,9 +46,9 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
 
     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         query, passage, *rest = inputs
-        eos_positions = rest[0] if rest else None
-        if hasattr(model, 'set_eos_positions'):
-            model.set_eos_positions(eos_positions)
+        sep_positions = rest[0] if rest else None
+        if hasattr(model, 'sep_positions'):
+            model.sep_positions = sep_positions
         return model(query=query, passage=passage).loss
 
     def training_step(self, *args):

From add3832f2071525e257658cbe42cf9f9bbb3b928 Mon Sep 17 00:00:00 2001
From: Ryan Yu
Date: Tue, 16 Dec 2025 00:49:27 -0500
Subject: [PATCH 13/14] added logs

---
 src/tevatron/retriever/collator.py         |  4 +-
 src/tevatron/retriever/modeling/encoder.py | 62 ++++++++++++++++++++--
 2 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py
index e489fa29..710cca91 100644
--- a/src/tevatron/retriever/collator.py
+++ b/src/tevatron/retriever/collator.py
@@ -55,8 +55,8 @@ def __call__(self, features: List[Tuple[str, List[str]]]):
 
         # Passage tokenization
         if self.data_args.passage_chunk_size > 0:
-            d_collated, eos_positions = self._tokenize_and_pad_chunked_passages(all_passages)
-            return q_collated, d_collated, eos_positions
+            
d_collated, sep_positions = self._tokenize_and_pad_chunked_passages(all_passages) + return q_collated, d_collated, sep_positions else: d_collated = self.tokenizer( all_passages, diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index 1ccd678d..ed8cf123 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Dict, Optional +import os import torch import torch.distributed as dist from torch import nn, Tensor @@ -49,7 +50,21 @@ def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = q_reps = self.encode_query(query) if query else None p_reps, chunk_mask = None, None if passage: - p_reps = self.encode_passage(passage) + # If training with chunked passages, sep_positions is produced by the collator and + # attached to the model by TevatronTrainer.compute_loss(). Forward() needs to pass it + # into encode_passage() to actually get chunk reps/masks. + sep_positions = getattr(self, "sep_positions", None) + if self.passage_chunk_size > 0 and sep_positions is not None: + print(f"sep_positions: {sep_positions}") + try: + p_reps = self.encode_passage(passage, sep_positions=sep_positions) + except TypeError: + # Some models (e.g., multimodal) don't accept sep_positions. + p_reps = self.encode_passage(passage) + else: + p_reps = self.encode_passage(passage) + print(f"p_reps: {p_reps}") + print(f"type(p_reps): {type(p_reps)}") if self.passage_chunk_size > 0 and isinstance(p_reps, tuple): p_reps, chunk_mask = p_reps @@ -65,10 +80,14 @@ def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, Tensor] = if self.is_ddp: q_reps = self._dist_gather_tensor(q_reps) p_reps = self._dist_gather_tensor(p_reps) - + print(f"passage_chunk_size: {self.passage_chunk_size}") + print(f"chunk_mask: {chunk_mask}") if self.passage_chunk_size > 0 and chunk_mask is not None: + print(f"start compute maxsim similarity==========================") scores = self.compute_maxsim_similarity(q_reps, p_reps, chunk_mask) + print(f"end compute maxsim similarity==========================") else: + print(f"start compute similarity==========================") scores = self.compute_similarity(q_reps, p_reps) scores = scores.view(q_reps.size(0), -1) @@ -116,7 +135,44 @@ def compute_maxsim_similarity(self, q_reps, p_reps, chunk_mask): if chunk_mask is not None: padding_mask = ~chunk_mask.unsqueeze(0).bool() chunk_scores = chunk_scores.masked_fill(padding_mask, float('-inf')) - return chunk_scores.max(dim=-1).values + max_vals, max_idx = chunk_scores.max(dim=-1) # [Q, P], [Q, P] + + # Print argmax chunk index + (optional) original token position from sep_positions + if True: + # only log from rank-0 if DDP + if (not getattr(self, "is_ddp", False)) or getattr(self, "process_rank", 0) == 0: + sep_positions = getattr(self, "sep_positions", None) + # If DDP gathered passages, sep_positions may not align; only use when sizes match. 
+ sep_ok = ( + isinstance(sep_positions, (list, tuple)) + and len(sep_positions) == p_reps.size(0) + ) + qn, pn = max_idx.size(0), max_idx.size(1) + for qi in range(qn): + for pi in range(pn): + ci = int(max_idx[qi, pi].item()) + # last valid chunk index for this passage (by mask) + if chunk_mask is not None: + valid = int(chunk_mask[pi].sum().item()) + last_ci = max(valid - 1, 0) + else: + last_ci = p_reps.size(1) - 1 + + if sep_ok and sep_positions[pi]: + pos_list = sep_positions[pi] + best_pos = pos_list[ci] if 0 <= ci < len(pos_list) else None + last_pos = pos_list[-1] + logger.info( + f"[maxsim] q={qi} p={pi} best_chunk={ci} best_pos={best_pos} " + f"last_chunk={last_ci} last_pos={last_pos} best_score={float(max_vals[qi, pi].item()):.6f}" + ) + else: + logger.info( + f"[maxsim] q={qi} p={pi} best_chunk={ci} last_chunk={last_ci} " + f"best_score={float(max_vals[qi, pi].item()):.6f}" + ) + + return max_vals def compute_loss(self, scores, target): return self.cross_entropy(scores, target) From 249dd9da7c69046f75f5b805a895284c9a78ea25 Mon Sep 17 00:00:00 2001 From: Ryan Yu Date: Fri, 19 Dec 2025 14:22:49 -0500 Subject: [PATCH 14/14] added some scripts --- finetune.sh | 27 ++++ finetune_with_chunk.sh | 27 ++++ req.txt | 271 +++++++++++++++++++++++++++++++++++++++ run_retrieval.sh | 65 ++++++++++ run_retrieval_chunked.sh | 65 ++++++++++ 5 files changed, 455 insertions(+) create mode 100755 finetune.sh create mode 100755 finetune_with_chunk.sh create mode 100644 req.txt create mode 100755 run_retrieval.sh create mode 100755 run_retrieval_chunked.sh diff --git a/finetune.sh b/finetune.sh new file mode 100755 index 00000000..dd983c33 --- /dev/null +++ b/finetune.sh @@ -0,0 +1,27 @@ +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train \ + --output_dir retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage \ + --model_name_or_path Qwen/Qwen3-Embedding-0.6B \ + --do_train \ + --lora \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj \ + --save_steps 50 \ + --dataset_name Tevatron/scifact \ + --dataset_split train \ + --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \ + --passage_prefix "" \ + --bf16 \ + --pooling last \ + --padding_side left \ + --normalize \ + --temperature 0.01 \ + --per_device_train_batch_size 4 \ + --gradient_checkpointing \ + --train_group_size 16 \ + --learning_rate 1e-4 \ + --query_max_len 32 \ + --passage_max_len 512 \ + --num_train_epochs 10 \ + --logging_steps 10 \ + --overwrite_output_dir \ + --gradient_accumulation_steps 1 \ + --passage_chunk_size 0 diff --git a/finetune_with_chunk.sh b/finetune_with_chunk.sh new file mode 100755 index 00000000..712fdf09 --- /dev/null +++ b/finetune_with_chunk.sh @@ -0,0 +1,27 @@ +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train \ + --output_dir retriever-qwen3-emb-ft-chunk-1219-1 \ + --model_name_or_path Qwen/Qwen3-Embedding-0.6B \ + --do_train \ + --lora \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj \ + --save_steps 50 \ + --dataset_name Tevatron/scifact \ + --dataset_split train \ + --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \ + --passage_prefix "" \ + --bf16 \ + --pooling last \ + --padding_side right \ + --normalize \ + --temperature 0.01 \ + --per_device_train_batch_size 4 \ + --gradient_checkpointing \ + --train_group_size 16 \ + --learning_rate 1e-4 \ + --query_max_len 32 \ + 
--passage_max_len 512 \ + --num_train_epochs 10 \ + --logging_steps 10 \ + --overwrite_output_dir \ + --gradient_accumulation_steps 1 \ + --passage_chunk_size 256 diff --git a/req.txt b/req.txt new file mode 100644 index 00000000..b033b240 --- /dev/null +++ b/req.txt @@ -0,0 +1,271 @@ +accelerate==1.10.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.0 +aiosignal==1.4.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.11.0 +astor==0.8.1 +attrs==25.4.0 +audioread==3.0.1 +Authlib==1.6.5 +av==16.0.1 +beautifulsoup4==4.14.2 +beir==2.2.0 +blake3==1.0.8 +blinker==1.9.0 +blis==1.3.0 +cachetools==6.2.1 +catalogue==2.0.10 +cbor==1.0.0 +cbor2==5.7.0 +certifi==2025.10.5 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.2.1 +clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 +cloudpathlib==0.23.0 +cloudpickle==3.1.1 +coloredlogs==15.0.1 +compressed-tensors==0.11.0 +confection==0.1.5 +contourpy==1.3.3 +cramjam==2.11.0 +cryptography==46.0.2 +cupy-cuda12x==13.6.0 +cycler==0.12.1 +cyclopts==3.24.0 +cymem==2.0.11 +Cython==3.1.4 +datasets==2.19.0 +decorator==5.2.1 +decord==0.6.0 +deepspeed==0.18.0 +depyf==0.19.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docstring_parser==0.17.0 +docutils==0.22.2 +einops==0.8.1 +email-validator==2.3.0 +exceptiongroup==1.3.0 +fairscale==0.4.13 +faiss-cpu==1.12.0 +fastapi==0.119.0 +fastapi-cli==0.0.13 +fastapi-cloud-cli==0.3.1 +fastmcp==2.12.4 +fastparquet==2024.11.0 +fastrlock==0.8.3 +filelock==3.20.0 +flash_attn==2.8.3 +Flask==3.1.2 +flatbuffers==25.9.23 +fonttools==4.60.1 +frozendict==2.4.6 +frozenlist==1.8.0 +fsspec==2024.3.1 +ftfy==6.3.1 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface-hub==0.35.3 +humanfriendly==10.0 +idna==3.10 +ijson==3.4.0.post0 +iniconfig==2.3.0 +inscriptis==2.6.0 +interegular==0.3.3 +ir_datasets==0.5.11 +isodate==0.7.2 +itsdangerous==2.2.0 +Jinja2==3.1.6 +jiter==0.11.0 +joblib==1.5.2 +jsonschema==4.25.1 +jsonschema-path==0.3.4 +jsonschema-specifications==2025.9.1 +kiwisolver==1.4.9 +langcodes==3.5.0 +language_data==1.3.0 +lark==1.2.2 +lazy-object-proxy==1.12.0 +lazy_loader==0.4 +librosa==0.11.0 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.11.3 +lxml==6.0.2 +lz4==4.4.4 +marisa-trie==1.3.1 +markdown-it-py==4.0.0 +MarkupSafe==3.0.3 +matplotlib==3.10.7 +mcp==1.17.0 +mdurl==0.1.2 +mistral_common==1.8.5 +ml_dtypes==0.5.3 +more-itertools==10.8.0 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.19.0 +multidict==6.7.0 +multiprocess==0.70.16 +murmurhash==1.0.13 +networkx==3.5 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.8.4.1 +nvidia-cuda-cupti-cu12==12.8.90 +nvidia-cuda-nvrtc-cu12==12.8.93 +nvidia-cuda-runtime-cu12==12.8.90 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.3.3.83 +nvidia-cufile-cu12==1.13.1.3 +nvidia-curand-cu12==10.3.9.90 +nvidia-cusolver-cu12==11.7.3.90 +nvidia-cusparse-cu12==12.5.8.93 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.3 +nvidia-nvjitlink-cu12==12.8.93 +nvidia-nvtx-cu12==12.8.90 +omegaconf==2.3.0 +onnx==1.19.1 +onnxoptimizer==0.3.13 +onnxruntime==1.23.1 +openai==2.3.0 +openai-harmony==0.0.4 +openapi-core==0.19.5 +openapi-pydantic==0.5.1 +openapi-schema-validator==0.6.3 +openapi-spec-validator==0.7.2 +opencv-python==4.12.0.88 +opencv-python-headless==4.12.0.88 +orjson==3.11.3 +outlines_core==0.2.11 +packaging==25.0 +pandas==2.3.3 +parse==1.20.2 +partial-json-parser==0.2.1.1.post6 +pathable==0.4.4 +peft==0.17.1 +pillow==11.3.0 
+platformdirs==4.5.0 +pluggy==1.6.0 +pooch==1.8.2 +preshed==3.0.10 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.23.1 +propcache==0.4.1 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pyarrow==21.0.0 +pyarrow-hotfix==0.7 +pybase64==1.4.2 +pybind11==3.0.1 +pycountry==24.6.1 +pycparser==2.23 +pydantic==2.12.0 +pydantic-extra-types==2.10.6 +pydantic-settings==2.11.0 +pydantic_core==2.41.1 +Pygments==2.19.2 +pyjnius==1.7.0 +pynndescent==0.5.13 +pyparsing==3.2.5 +pyperclip==1.11.0 +-e git+ssh://git@github.com/FarmersWrap/pyserini.git@a1995bffa243636c89029735236348c1e5206161#egg=pyserini +pytest==9.0.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.1 +python-json-logger==4.0.0 +python-multipart==0.0.20 +pytrec_eval-terrier==0.5.9 +pytz==2025.2 +PyYAML==6.0.3 +pyzmq==27.1.0 +qwen-omni-utils==0.0.8 +ranx==0.3.21 +ray==2.50.0 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rfc3339-validator==0.1.4 +rich==14.2.0 +rich-rst==1.3.1 +rich-toolkit==0.15.1 +rignore==0.7.1 +rpds-py==0.27.1 +safetensors==0.6.2 +scikit-learn==1.7.2 +scipy==1.16.2 +seaborn==0.13.2 +sentence-transformers==5.1.1 +sentencepiece==0.2.1 +sentry-sdk==2.42.0 +setproctitle==1.3.7 +setuptools==80.9.0 +shellingham==1.5.4 +six==1.17.0 +smart_open==7.3.1 +sniffio==1.3.1 +soundfile==0.13.1 +soupsieve==2.8 +soxr==1.0.0 +spacy==3.8.7 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.5.1 +sse-starlette==3.0.2 +starlette==0.48.0 +sympy==1.14.0 +tabulate==0.9.0 +-e git+ssh://git@github.com/FarmersWrap/tevatron.git@add3832f2071525e257658cbe42cf9f9bbb3b928#egg=tevatron +thinc==8.3.6 +threadpoolctl==3.6.0 +tiktoken==0.12.0 +timm==1.0.20 +tokenizers==0.22.1 +torch==2.8.0 +torchaudio==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +transformers==4.57.0 +trec-car-tools==2.6 +triton==3.4.0 +typeguard==4.4.4 +typer==0.19.2 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +tzdata==2025.2 +umap-learn==0.5.9.post2 +uniir_for_pyserini==0.1.1 +unlzw3==0.2.3 +urllib3==2.5.0 +uvicorn==0.37.0 +uvloop==0.22.1 +vllm==0.11.0 +warc3-wet==0.2.5 +warc3-wet-clueweb09==0.2.5 +wasabi==1.1.3 +watchfiles==1.1.1 +wcwidth==0.2.14 +weasel==0.4.1 +websockets==15.0.1 +Werkzeug==3.1.1 +wheel==0.45.1 +wrapt==1.17.3 +xformers==0.0.32.post1 +xgrammar==0.1.25 +xxhash==3.6.0 +yarl==1.22.0 +zlib-state==0.1.10 diff --git a/run_retrieval.sh b/run_retrieval.sh new file mode 100755 index 00000000..9ee8d347 --- /dev/null +++ b/run_retrieval.sh @@ -0,0 +1,65 @@ +output_dir=retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \ + --output_dir=temp \ + --model_name_or_path Qwen/Qwen3-Embedding-0.6B \ + --bf16 \ + --per_device_eval_batch_size 4 \ + --normalize \ + --pooling last \ + --padding_side right \ + --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \ + --query_max_len 512 \ + --dataset_name Tevatron/beir \ + --dataset_config scifact \ + --dataset_split test \ + --encode_output_path ${output_dir}/queries_scifact.pkl \ + --encode_is_query + + +# Encode corpus +CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \ + --output_dir=temp \ + --model_name_or_path Qwen/Qwen3-Embedding-0.6B \ + --bf16 \ + --per_device_eval_batch_size 4 \ + --normalize \ + --pooling last \ + --padding_side right \ + --passage_prefix "" \ + --passage_max_len 512 \ + --dataset_name Tevatron/beir-corpus \ + --dataset_config scifact \ + --dataset_split train \ + --encode_output_path ${output_dir}/corpus_scifact.pkl \ + 
--passage_chunk_size 0
+
+python -m tevatron.retriever.driver.search \
+    --query_reps ${output_dir}/queries_scifact.pkl \
+    --passage_reps ${output_dir}/corpus_scifact.pkl \
+    --depth 100 \
+    --batch_size 64 \
+    --save_text \
+    --save_ranking_to ${output_dir}/rank.scifact.txt
+
+# Convert to TREC format
+python -m tevatron.utils.format.convert_result_to_trec --input ${output_dir}/rank.scifact.txt \
+                                                       --output ${output_dir}/rank.scifact.trec \
+                                                       --remove_query
+python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 beir-v1.0.0-scifact-test ${output_dir}/rank.scifact.trec
+
+# recall_100              all     0.9767
+# ndcg_cut_10             all     0.7801
+
diff --git a/run_retrieval_chunked.sh b/run_retrieval_chunked.sh
new file mode 100755
index 00000000..b80ae37d
--- /dev/null
+++ b/run_retrieval_chunked.sh
@@ -0,0 +1,65 @@
+output_dir=retriever-qwen3-emb-ft-chunk-1219-1
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
+  --output_dir=temp \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --bf16 \
+  --per_device_eval_batch_size 4 \
+  --normalize \
+  --pooling last \
+  --padding_side right \
+  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \
+  --query_max_len 512 \
+  --dataset_name Tevatron/beir \
+  --dataset_config scifact \
+  --dataset_split test \
+  --encode_output_path ${output_dir}/queries_scifact.pkl \
+  --encode_is_query
+
+
+# Encode corpus
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
+  --output_dir=temp \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --bf16 \
+  --per_device_eval_batch_size 4 \
+  --normalize \
+  --pooling last \
+  --padding_side right \
+  --passage_prefix "" \
+  --passage_max_len 512 \
+  --dataset_name Tevatron/beir-corpus \
+  --dataset_config scifact \
+  --dataset_split train \
+  --encode_output_path ${output_dir}/corpus_scifact.pkl \
+  --passage_chunk_size 256
+
+python -m tevatron.retriever.driver.search \
+    --query_reps ${output_dir}/queries_scifact.pkl \
+    --passage_reps ${output_dir}/corpus_scifact.pkl \
+    --depth 100 \
+    --batch_size 64 \
+    --save_text \
+    --save_ranking_to ${output_dir}/rank.scifact.txt
+
+# Convert to TREC format
+python -m tevatron.utils.format.convert_result_to_trec --input ${output_dir}/rank.scifact.txt \
+                                                       --output ${output_dir}/rank.scifact.trec \
+                                                       --remove_query
+
+python -m tevatron.retriever.driver.search \
+    --query_reps ${output_dir}/queries_scifact.pkl \
+    --passage_reps ${output_dir}/corpus_scifact.pkl \
+    --depth 1000 \
+    --batch_size 64 \
+    --save_text \
+    --save_ranking_to ${output_dir}/rank.scifact.txt
+
+# Convert to TREC format
+python -m tevatron.utils.format.convert_result_to_trec --input ${output_dir}/rank.scifact.txt \
+                                                       --output ${output_dir}/rank.scifact.trec \
+                                                       --remove_query
+python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 beir-v1.0.0-scifact-test ${output_dir}/rank.scifact.trec
+
+# recall_100              all     0.9767
+# ndcg_cut_10             all     0.7801
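---

A note on the mechanism this series implements. The collator splits each
passage into fixed-size token chunks, appends an EOS separator after every
chunk, and records the separator positions; the encoder then pools one
embedding per separator and scores a (query, passage) pair as the maximum
query-chunk similarity (MaxSim), with padded chunk slots masked to -inf so
they can never win the max. Below is a minimal, self-contained sketch of that
scoring path on toy tensors. chunk_with_separators and maxsim_scores are
illustrative names invented for this sketch, not APIs from the repo, although
the chunking loop mirrors the collator and the -inf masking mirrors
compute_maxsim_similarity() in encoder.py.

import torch
import torch.nn.functional as F

def chunk_with_separators(tokens, chunk_size, eos_id):
    # Split a token list into fixed-size chunks, append an EOS separator after
    # each chunk, and record each separator's position (as the collator does).
    new_tokens, sep_positions = [], []
    for i in range(0, max(len(tokens), 1), chunk_size):
        new_tokens.extend(tokens[i:i + chunk_size])
        new_tokens.append(eos_id)
        sep_positions.append(len(new_tokens) - 1)
    return new_tokens, sep_positions

def maxsim_scores(q_reps, chunk_reps, chunk_mask):
    # q_reps:     [Q, H]    one embedding per query
    # chunk_reps: [P, C, H] one embedding per passage chunk (zero-padded to C)
    # chunk_mask: [P, C]    1.0 for real chunks, 0.0 for padding slots
    chunk_scores = torch.einsum('qh,pch->qpc', q_reps, chunk_reps)  # [Q, P, C]
    # padded chunk slots must never win the max
    chunk_scores = chunk_scores.masked_fill(~chunk_mask.bool().unsqueeze(0), float('-inf'))
    return chunk_scores.max(dim=-1).values  # [Q, P]

# Toy demo: chunk a 5-token passage into chunks of 2 with eos_id=7.
tokens, seps = chunk_with_separators([11, 12, 13, 14, 15], chunk_size=2, eos_id=7)
print(tokens)  # [11, 12, 7, 13, 14, 7, 15, 7]
print(seps)    # [2, 5, 7]

# Score 2 queries against 3 passages with up to 2 chunks each (H = 4).
torch.manual_seed(0)
q = F.normalize(torch.randn(2, 4), dim=-1)
p = F.normalize(torch.randn(3, 2, 4), dim=-1)
mask = torch.tensor([[1., 1.], [1., 0.], [1., 1.]])  # passage 1 has one real chunk
print(maxsim_scores(q, p, mask))  # [2, 3] score matrix

With --passage_chunk_size 0 (finetune.sh, run_retrieval.sh) the collator falls
back to one vector per passage and plain dot-product scoring; with
--passage_chunk_size 256 (finetune_with_chunk.sh, run_retrieval_chunked.sh)
each passage contributes one vector per 256-token chunk and is ranked by its
best-matching chunk.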