Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions finetune.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# LoRA fine-tuning of Qwen/Qwen3-Embedding-0.6B on Tevatron/scifact through the
# Tevatron retriever training driver, pinned to GPU 0.
# This variant passes --passage_chunk_size 0 (presumably "no passage chunking",
# matching the "no-chunk" tag in the output directory name — confirm against
# the driver).

train_args=(
  # Where checkpoints go and which base model is adapted.
  --output_dir retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage
  --model_name_or_path Qwen/Qwen3-Embedding-0.6B
  --do_train

  # LoRA on the attention and MLP projection layers.
  --lora
  --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj
  --save_steps 50

  # Training data and the text prepended to queries / passages.
  # NOTE(review): inside double quotes bash keeps "\n" as a literal backslash
  # followed by "n", not a newline — confirm that is what the driver expects.
  --dataset_name Tevatron/scifact
  --dataset_split train
  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:"
  --passage_prefix ""

  # Precision, pooling and similarity settings.
  --bf16
  --pooling last
  --padding_side left
  --normalize
  --temperature 0.01

  # Batch shape and optimisation schedule.
  --per_device_train_batch_size 4
  --gradient_checkpointing
  --train_group_size 16
  --learning_rate 1e-4
  --query_max_len 32
  --passage_max_len 512
  --num_train_epochs 10
  --logging_steps 10
  --overwrite_output_dir
  --gradient_accumulation_steps 1

  --passage_chunk_size 0
)

CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train "${train_args[@]}"
27 changes: 27 additions & 0 deletions finetune_with_chunk.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# LoRA fine-tuning of Qwen/Qwen3-Embedding-0.6B on Tevatron/scifact through the
# Tevatron retriever training driver, pinned to GPU 0.
# This variant trains with --passage_chunk_size 256 and right-side padding.

chunk_train_args=(
  # Where checkpoints go and which base model is adapted.
  --output_dir retriever-qwen3-emb-ft-chunk-1219-1
  --model_name_or_path Qwen/Qwen3-Embedding-0.6B
  --do_train

  # LoRA on the attention and MLP projection layers.
  --lora
  --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj
  --save_steps 50

  # Training data and the text prepended to queries / passages.
  # NOTE(review): inside double quotes bash keeps "\n" as a literal backslash
  # followed by "n", not a newline — confirm that is what the driver expects.
  --dataset_name Tevatron/scifact
  --dataset_split train
  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:"
  --passage_prefix ""

  # Precision, pooling and similarity settings.
  --bf16
  --pooling last
  --padding_side right
  --normalize
  --temperature 0.01

  # Batch shape and optimisation schedule.
  --per_device_train_batch_size 4
  --gradient_checkpointing
  --train_group_size 16
  --learning_rate 1e-4
  --query_max_len 32
  --passage_max_len 512
  --num_train_epochs 10
  --logging_steps 10
  --overwrite_output_dir
  --gradient_accumulation_steps 1

  # Chunk size handed to the driver for passages.
  --passage_chunk_size 256
)

CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train "${chunk_train_args[@]}"
271 changes: 271 additions & 0 deletions req.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
accelerate==1.10.1
aiohappyeyeballs==2.6.1
aiohttp==3.13.0
aiosignal==1.4.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.11.0
astor==0.8.1
attrs==25.4.0
audioread==3.0.1
Authlib==1.6.5
av==16.0.1
beautifulsoup4==4.14.2
beir==2.2.0
blake3==1.0.8
blinker==1.9.0
blis==1.3.0
cachetools==6.2.1
catalogue==2.0.10
cbor==1.0.0
cbor2==5.7.0
certifi==2025.10.5
cffi==2.0.0
charset-normalizer==3.4.3
click==8.2.1
clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
cloudpathlib==0.23.0
cloudpickle==3.1.1
coloredlogs==15.0.1
compressed-tensors==0.11.0
confection==0.1.5
contourpy==1.3.3
cramjam==2.11.0
cryptography==46.0.2
cupy-cuda12x==13.6.0
cycler==0.12.1
cyclopts==3.24.0
cymem==2.0.11
Cython==3.1.4
datasets==2.19.0
decorator==5.2.1
decord==0.6.0
deepspeed==0.18.0
depyf==0.19.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.17.0
docutils==0.22.2
einops==0.8.1
email-validator==2.3.0
exceptiongroup==1.3.0
fairscale==0.4.13
faiss-cpu==1.12.0
fastapi==0.119.0
fastapi-cli==0.0.13
fastapi-cloud-cli==0.3.1
fastmcp==2.12.4
fastparquet==2024.11.0
fastrlock==0.8.3
filelock==3.20.0
flash_attn==2.8.3
Flask==3.1.2
flatbuffers==25.9.23
fonttools==4.60.1
frozendict==2.4.6
frozenlist==1.8.0
fsspec==2024.3.1
ftfy==6.3.1
gguf==0.17.1
h11==0.16.0
hf-xet==1.1.10
hjson==3.1.0
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface-hub==0.35.3
humanfriendly==10.0
idna==3.10
ijson==3.4.0.post0
iniconfig==2.3.0
inscriptis==2.6.0
interegular==0.3.3
ir_datasets==0.5.11
isodate==0.7.2
itsdangerous==2.2.0
Jinja2==3.1.6
jiter==0.11.0
joblib==1.5.2
jsonschema==4.25.1
jsonschema-path==0.3.4
jsonschema-specifications==2025.9.1
kiwisolver==1.4.9
langcodes==3.5.0
language_data==1.3.0
lark==1.2.2
lazy-object-proxy==1.12.0
lazy_loader==0.4
librosa==0.11.0
llguidance==0.7.30
llvmlite==0.44.0
lm-format-enforcer==0.11.3
lxml==6.0.2
lz4==4.4.4
marisa-trie==1.3.1
markdown-it-py==4.0.0
MarkupSafe==3.0.3
matplotlib==3.10.7
mcp==1.17.0
mdurl==0.1.2
mistral_common==1.8.5
ml_dtypes==0.5.3
more-itertools==10.8.0
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.19.0
multidict==6.7.0
multiprocess==0.70.16
murmurhash==1.0.13
networkx==3.5
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvtx-cu12==12.8.90
omegaconf==2.3.0
onnx==1.19.1
onnxoptimizer==0.3.13
onnxruntime==1.23.1
openai==2.3.0
openai-harmony==0.0.4
openapi-core==0.19.5
openapi-pydantic==0.5.1
openapi-schema-validator==0.6.3
openapi-spec-validator==0.7.2
opencv-python==4.12.0.88
opencv-python-headless==4.12.0.88
orjson==3.11.3
outlines_core==0.2.11
packaging==25.0
pandas==2.3.3
parse==1.20.2
partial-json-parser==0.2.1.1.post6
pathable==0.4.4
peft==0.17.1
pillow==11.3.0
platformdirs==4.5.0
pluggy==1.6.0
pooch==1.8.2
preshed==3.0.10
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.23.1
propcache==0.4.1
protobuf==6.32.1
psutil==7.1.0
py-cpuinfo==9.0.0
pyarrow==21.0.0
pyarrow-hotfix==0.7
pybase64==1.4.2
pybind11==3.0.1
pycountry==24.6.1
pycparser==2.23
pydantic==2.12.0
pydantic-extra-types==2.10.6
pydantic-settings==2.11.0
pydantic_core==2.41.1
Pygments==2.19.2
pyjnius==1.7.0
pynndescent==0.5.13
pyparsing==3.2.5
pyperclip==1.11.0
-e git+ssh://git@github.com/FarmersWrap/pyserini.git@a1995bffa243636c89029735236348c1e5206161#egg=pyserini
pytest==9.0.1
python-dateutil==2.9.0.post0
python-dotenv==1.1.1
python-json-logger==4.0.0
python-multipart==0.0.20
pytrec_eval-terrier==0.5.9
pytz==2025.2
PyYAML==6.0.3
pyzmq==27.1.0
qwen-omni-utils==0.0.8
ranx==0.3.21
ray==2.50.0
referencing==0.36.2
regex==2025.9.18
requests==2.32.5
rfc3339-validator==0.1.4
rich==14.2.0
rich-rst==1.3.1
rich-toolkit==0.15.1
rignore==0.7.1
rpds-py==0.27.1
safetensors==0.6.2
scikit-learn==1.7.2
scipy==1.16.2
seaborn==0.13.2
sentence-transformers==5.1.1
sentencepiece==0.2.1
sentry-sdk==2.42.0
setproctitle==1.3.7
setuptools==80.9.0
shellingham==1.5.4
six==1.17.0
smart_open==7.3.1
sniffio==1.3.1
soundfile==0.13.1
soupsieve==2.8
soxr==1.0.0
spacy==3.8.7
spacy-legacy==3.0.12
spacy-loggers==1.0.5
srsly==2.5.1
sse-starlette==3.0.2
starlette==0.48.0
sympy==1.14.0
tabulate==0.9.0
-e git+ssh://git@github.com/FarmersWrap/tevatron.git@add3832f2071525e257658cbe42cf9f9bbb3b928#egg=tevatron
thinc==8.3.6
threadpoolctl==3.6.0
tiktoken==0.12.0
timm==1.0.20
tokenizers==0.22.1
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
tqdm==4.67.1
transformers==4.57.0
trec-car-tools==2.6
triton==3.4.0
typeguard==4.4.4
typer==0.19.2
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.2
umap-learn==0.5.9.post2
uniir_for_pyserini==0.1.1
unlzw3==0.2.3
urllib3==2.5.0
uvicorn==0.37.0
uvloop==0.22.1
vllm==0.11.0
warc3-wet==0.2.5
warc3-wet-clueweb09==0.2.5
wasabi==1.1.3
watchfiles==1.1.1
wcwidth==0.2.14
weasel==0.4.1
websockets==15.0.1
Werkzeug==3.1.1
wheel==0.45.1
wrapt==1.17.3
xformers==0.0.32.post1
xgrammar==0.1.25
xxhash==3.6.0
yarl==1.22.0
zlib-state==0.1.10
65 changes: 65 additions & 0 deletions run_retrieval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# SciFact retrieval evaluation pipeline:
#   1. encode the BEIR SciFact test queries,
#   2. encode the BEIR SciFact corpus,
#   3. search the corpus embeddings with the query embeddings (top 100),
#   4. convert the ranking to TREC format,
#   5. score it with pyserini's trec_eval wrapper (recall@100, nDCG@10).
# All artefacts are written under ${output_dir}.

# Abort on the first failing step so later stages never score stale or
# missing embedding / ranking files.
set -euo pipefail

output_dir=retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage

# The encoders are told to write directly into this directory; make sure it
# exists before they run.
mkdir -p -- "${output_dir}"

# NOTE(review): both encode steps load the base Qwen/Qwen3-Embedding-0.6B
# checkpoint and pass nothing from ${output_dir} as a model/adapter, even
# though the directory is named after a fine-tuning run — confirm this is the
# intended (baseline) evaluation.
# NOTE(review): inside double quotes bash keeps "\n" in --query_prefix as a
# literal backslash followed by "n", not a newline — confirm the driver
# expects that.

# Encode queries
CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
  --output_dir=temp \
  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
  --bf16 \
  --per_device_eval_batch_size 4 \
  --normalize \
  --pooling last \
  --padding_side right \
  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \
  --query_max_len 512 \
  --dataset_name Tevatron/beir \
  --dataset_config scifact \
  --dataset_split test \
  --encode_output_path "${output_dir}/queries_scifact.pkl" \
  --encode_is_query

# Encode corpus
CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
  --output_dir=temp \
  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
  --bf16 \
  --per_device_eval_batch_size 4 \
  --normalize \
  --pooling last \
  --padding_side right \
  --passage_prefix "" \
  --passage_max_len 512 \
  --dataset_name Tevatron/beir-corpus \
  --dataset_config scifact \
  --dataset_split train \
  --encode_output_path "${output_dir}/corpus_scifact.pkl" \
  --passage_chunk_size 0

# Search (run once; the original script repeated this step and the TREC
# conversion verbatim, rewriting the same output files).
python -m tevatron.retriever.driver.search \
  --query_reps "${output_dir}/queries_scifact.pkl" \
  --passage_reps "${output_dir}/corpus_scifact.pkl" \
  --depth 100 \
  --batch_size 64 \
  --save_text \
  --save_ranking_to "${output_dir}/rank.scifact.txt"

# Convert to TREC format
python -m tevatron.utils.format.convert_result_to_trec \
  --input "${output_dir}/rank.scifact.txt" \
  --output "${output_dir}/rank.scifact.trec" \
  --remove_query

# Evaluate
python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 \
  beir-v1.0.0-scifact-test "${output_dir}/rank.scifact.trec"

# Scores recorded with the original script:
# recall_100 all 0.9767
# ndcg_cut_10 all 0.7801

Loading