texttron · FarmersWrap · Dec 10, 2025 · Dec 10, 2025 · Dec 11, 2025 · Dec 14, 2025
diff --git a/finetune.sh b/finetune.sh
@@ -0,0 +1,54 @@
+# Launches tevatron.retriever.driver.train with GPU 0 as the only visible CUDA
+# device, starting from Qwen/Qwen3-Embedding-0.6B with --lora enabled, on the
+# train split of Tevatron/scifact.
+#
+# The flags are accumulated in the positional parameters so they can be grouped
+# and commented; arguments given to this script itself are discarded.
+
+# Output location plus checkpoint and logging cadence.
+set -- \
+  --output_dir retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage \
+  --overwrite_output_dir \
+  --save_steps 50 \
+  --logging_steps 10
+
+# Base checkpoint and LoRA settings.
+set -- "$@" \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --lora \
+  --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj
+
+# Training data and prefixes. The shell passes the "\n" below as a literal
+# backslash followed by "n", not as a newline character.
+set -- "$@" \
+  --dataset_name Tevatron/scifact \
+  --dataset_split train \
+  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \
+  --passage_prefix ""
+
+# Pooling, padding, normalisation and temperature.
+set -- "$@" \
+  --pooling last \
+  --padding_side left \
+  --normalize \
+  --temperature 0.01
+
+# Length limits. NOTE(review): --passage_chunk_size 0 presumably disables
+# passage chunking (the output dir is named "no-chunk") -- confirm.
+set -- "$@" \
+  --query_max_len 32 \
+  --passage_max_len 512 \
+  --passage_chunk_size 0
+
+# Optimisation settings.
+set -- "$@" \
+  --do_train \
+  --bf16 \
+  --gradient_checkpointing \
+  --learning_rate 1e-4 \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size 4 \
+  --gradient_accumulation_steps 1 \
+  --train_group_size 16
+
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train "$@"
diff --git a/finetune_with_chunk.sh b/finetune_with_chunk.sh
@@ -0,0 +1,53 @@
+# Launches tevatron.retriever.driver.train with GPU 0 as the only visible CUDA
+# device, starting from Qwen/Qwen3-Embedding-0.6B with --lora enabled, on the
+# train split of Tevatron/scifact, using --passage_chunk_size 256.
+#
+# The flags are accumulated in the positional parameters so they can be grouped
+# and commented; arguments given to this script itself are discarded.
+
+# Output location plus checkpoint and logging cadence.
+set -- \
+  --output_dir retriever-qwen3-emb-ft-chunk-1219-1 \
+  --overwrite_output_dir \
+  --save_steps 50 \
+  --logging_steps 10
+
+# Base checkpoint and LoRA settings.
+set -- "$@" \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --lora \
+  --lora_target_modules q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj
+
+# Training data and prefixes. The shell passes the "\n" below as a literal
+# backslash followed by "n", not as a newline character.
+set -- "$@" \
+  --dataset_name Tevatron/scifact \
+  --dataset_split train \
+  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \
+  --passage_prefix ""
+
+# Pooling, padding, normalisation and temperature.
+set -- "$@" \
+  --pooling last \
+  --padding_side right \
+  --normalize \
+  --temperature 0.01
+
+# Length limits and passage chunk size.
+set -- "$@" \
+  --query_max_len 32 \
+  --passage_max_len 512 \
+  --passage_chunk_size 256
+
+# Optimisation settings.
+set -- "$@" \
+  --do_train \
+  --bf16 \
+  --gradient_checkpointing \
+  --learning_rate 1e-4 \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size 4 \
+  --gradient_accumulation_steps 1 \
+  --train_group_size 16
+
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.train "$@"
diff --git a/req.txt b/req.txt
@@ -0,0 +1,271 @@
+accelerate==1.10.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.0
+aiosignal==1.4.0
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.11.0
+astor==0.8.1
+attrs==25.4.0
+audioread==3.0.1
+Authlib==1.6.5
+av==16.0.1
+beautifulsoup4==4.14.2
+beir==2.2.0
+blake3==1.0.8
+blinker==1.9.0
+blis==1.3.0
+cachetools==6.2.1
+catalogue==2.0.10
+cbor==1.0.0
+cbor2==5.7.0
+certifi==2025.10.5
+cffi==2.0.0
+charset-normalizer==3.4.3
+click==8.2.1
+clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
+cloudpathlib==0.23.0
+cloudpickle==3.1.1
+coloredlogs==15.0.1
+compressed-tensors==0.11.0
+confection==0.1.5
+contourpy==1.3.3
+cramjam==2.11.0
+cryptography==46.0.2
+cupy-cuda12x==13.6.0
+cycler==0.12.1
+cyclopts==3.24.0
+cymem==2.0.11
+Cython==3.1.4
+datasets==2.19.0
+decorator==5.2.1
+decord==0.6.0
+deepspeed==0.18.0
+depyf==0.19.0
+dill==0.3.8
+diskcache==5.6.3
+distro==1.9.0
+dnspython==2.8.0
+docstring_parser==0.17.0
+docutils==0.22.2
+einops==0.8.1
+email-validator==2.3.0
+exceptiongroup==1.3.0
+fairscale==0.4.13
+faiss-cpu==1.12.0
+fastapi==0.119.0
+fastapi-cli==0.0.13
+fastapi-cloud-cli==0.3.1
+fastmcp==2.12.4
+fastparquet==2024.11.0
+fastrlock==0.8.3
+filelock==3.20.0
+flash_attn==2.8.3
+Flask==3.1.2
+flatbuffers==25.9.23
+fonttools==4.60.1
+frozendict==2.4.6
+frozenlist==1.8.0
+fsspec==2024.3.1
+ftfy==6.3.1
+gguf==0.17.1
+h11==0.16.0
+hf-xet==1.1.10
+hjson==3.1.0
+httpcore==1.0.9
+httptools==0.7.1
+httpx==0.28.1
+httpx-sse==0.4.3
+huggingface-hub==0.35.3
+humanfriendly==10.0
+idna==3.10
+ijson==3.4.0.post0
+iniconfig==2.3.0
+inscriptis==2.6.0
+interegular==0.3.3
+ir_datasets==0.5.11
+isodate==0.7.2
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jiter==0.11.0
+joblib==1.5.2
+jsonschema==4.25.1
+jsonschema-path==0.3.4
+jsonschema-specifications==2025.9.1
+kiwisolver==1.4.9
+langcodes==3.5.0
+language_data==1.3.0
+lark==1.2.2
+lazy-object-proxy==1.12.0
+lazy_loader==0.4
+librosa==0.11.0
+llguidance==0.7.30
+llvmlite==0.44.0
+lm-format-enforcer==0.11.3
+lxml==6.0.2
+lz4==4.4.4
+marisa-trie==1.3.1
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+matplotlib==3.10.7
+mcp==1.17.0
+mdurl==0.1.2
+mistral_common==1.8.5
+ml_dtypes==0.5.3
+more-itertools==10.8.0
+mpmath==1.3.0
+msgpack==1.1.2
+msgspec==0.19.0
+multidict==6.7.0
+multiprocess==0.70.16
+murmurhash==1.0.13
+networkx==3.5
+ninja==1.13.0
+numba==0.61.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.8.4.1
+nvidia-cuda-cupti-cu12==12.8.90
+nvidia-cuda-nvrtc-cu12==12.8.93
+nvidia-cuda-runtime-cu12==12.8.90
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.3.83
+nvidia-cufile-cu12==1.13.1.3
+nvidia-curand-cu12==10.3.9.90
+nvidia-cusolver-cu12==11.7.3.90
+nvidia-cusparse-cu12==12.5.8.93
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.3
+nvidia-nvjitlink-cu12==12.8.93
+nvidia-nvtx-cu12==12.8.90
+omegaconf==2.3.0
+onnx==1.19.1
+onnxoptimizer==0.3.13
+onnxruntime==1.23.1
+openai==2.3.0
+openai-harmony==0.0.4
+openapi-core==0.19.5
+openapi-pydantic==0.5.1
+openapi-schema-validator==0.6.3
+openapi-spec-validator==0.7.2
+opencv-python==4.12.0.88
+opencv-python-headless==4.12.0.88
+orjson==3.11.3
+outlines_core==0.2.11
+packaging==25.0
+pandas==2.3.3
+parse==1.20.2
+partial-json-parser==0.2.1.1.post6
+pathable==0.4.4
+peft==0.17.1
+pillow==11.3.0
+platformdirs==4.5.0
+pluggy==1.6.0
+pooch==1.8.2
+preshed==3.0.10
+prometheus-fastapi-instrumentator==7.1.0
+prometheus_client==0.23.1
+propcache==0.4.1
+protobuf==6.32.1
+psutil==7.1.0
+py-cpuinfo==9.0.0
+pyarrow==21.0.0
+pyarrow-hotfix==0.7
+pybase64==1.4.2
+pybind11==3.0.1
+pycountry==24.6.1
+pycparser==2.23
+pydantic==2.12.0
+pydantic-extra-types==2.10.6
+pydantic-settings==2.11.0
+pydantic_core==2.41.1
+Pygments==2.19.2
+pyjnius==1.7.0
+pynndescent==0.5.13
+pyparsing==3.2.5
+pyperclip==1.11.0
+-e git+https://github.com/FarmersWrap/pyserini.git@a1995bffa243636c89029735236348c1e5206161#egg=pyserini  # HTTPS so the install does not depend on GitHub SSH keys
+pytest==9.0.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+python-json-logger==4.0.0
+python-multipart==0.0.20
+pytrec_eval-terrier==0.5.9
+pytz==2025.2
+PyYAML==6.0.3
+pyzmq==27.1.0
+qwen-omni-utils==0.0.8
+ranx==0.3.21
+ray==2.50.0
+referencing==0.36.2
+regex==2025.9.18
+requests==2.32.5
+rfc3339-validator==0.1.4
+rich==14.2.0
+rich-rst==1.3.1
+rich-toolkit==0.15.1
+rignore==0.7.1
+rpds-py==0.27.1
+safetensors==0.6.2
+scikit-learn==1.7.2
+scipy==1.16.2
+seaborn==0.13.2
+sentence-transformers==5.1.1
+sentencepiece==0.2.1
+sentry-sdk==2.42.0
+setproctitle==1.3.7
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+smart_open==7.3.1
+sniffio==1.3.1
+soundfile==0.13.1
+soupsieve==2.8
+soxr==1.0.0
+spacy==3.8.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.5.1
+sse-starlette==3.0.2
+starlette==0.48.0
+sympy==1.14.0
+tabulate==0.9.0
+-e git+https://github.com/FarmersWrap/tevatron.git@add3832f2071525e257658cbe42cf9f9bbb3b928#egg=tevatron  # HTTPS so the install does not depend on GitHub SSH keys
+thinc==8.3.6
+threadpoolctl==3.6.0
+tiktoken==0.12.0
+timm==1.0.20
+tokenizers==0.22.1
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
+tqdm==4.67.1
+transformers==4.57.0
+trec-car-tools==2.6
+triton==3.4.0
+typeguard==4.4.4
+typer==0.19.2
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.2
+umap-learn==0.5.9.post2
+uniir_for_pyserini==0.1.1
+unlzw3==0.2.3
+urllib3==2.5.0
+uvicorn==0.37.0
+uvloop==0.22.1
+vllm==0.11.0
+warc3-wet==0.2.5
+warc3-wet-clueweb09==0.2.5
+wasabi==1.1.3
+watchfiles==1.1.1
+wcwidth==0.2.14
+weasel==0.4.1
+websockets==15.0.1
+Werkzeug==3.1.1
+wheel==0.45.1
+wrapt==1.17.3
+xformers==0.0.32.post1
+xgrammar==0.1.25
+xxhash==3.6.0
+yarl==1.22.0
+zlib-state==0.1.10
diff --git a/run_retrieval.sh b/run_retrieval.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Encodes the SciFact test queries and corpus, runs dense search over the
+# resulting embeddings, converts the ranking to TREC format and evaluates it
+# with pyserini's trec_eval wrapper (recall.100 and ndcg_cut.10).
+#
+# Abort on the first failing command or unset variable so that later stages
+# never run against missing or stale embedding / ranking files.
+set -eu
+
+# Every artefact (embeddings, rankings) is written under this directory.
+output_dir=retriever-qwen3-emb-ft-chunk-1219-no-chunk-4-group-512-passage
+# NOTE(review): presumably the encode driver does not create the parent of
+# --encode_output_path on its own; creating it here is harmless either way.
+mkdir -p -- "${output_dir}"
+
+# NOTE(review): both encode steps load the base Qwen/Qwen3-Embedding-0.6B
+# checkpoint and are not given any model or adapter path under ${output_dir}.
+# Confirm that scoring the base model is what is intended here.
+
+# Encode queries
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
+  --output_dir=temp \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --bf16 \
+  --per_device_eval_batch_size 4 \
+  --normalize \
+  --pooling last \
+  --padding_side right \
+  --query_prefix "Instruct: Given a scientific claim, retrieve documents that support or refute the claim.\nQuery:" \
+  --query_max_len 512 \
+  --dataset_name Tevatron/beir \
+  --dataset_config scifact \
+  --dataset_split test \
+  --encode_output_path "${output_dir}/queries_scifact.pkl" \
+  --encode_is_query
+
+# Encode corpus
+CUDA_VISIBLE_DEVICES=0 python -m tevatron.retriever.driver.encode \
+  --output_dir=temp \
+  --model_name_or_path Qwen/Qwen3-Embedding-0.6B \
+  --bf16 \
+  --per_device_eval_batch_size 4 \
+  --normalize \
+  --pooling last \
+  --padding_side right \
+  --passage_prefix "" \
+  --passage_max_len 512 \
+  --dataset_name Tevatron/beir-corpus \
+  --dataset_config scifact \
+  --dataset_split train \
+  --encode_output_path "${output_dir}/corpus_scifact.pkl" \
+  --passage_chunk_size 0
+
+# Search with depth 100. Run exactly once: repeating it with the same
+# arguments would only rewrite the same ranking file.
+python -m tevatron.retriever.driver.search \
+  --query_reps "${output_dir}/queries_scifact.pkl" \
+  --passage_reps "${output_dir}/corpus_scifact.pkl" \
+  --depth 100 \
+  --batch_size 64 \
+  --save_text \
+  --save_ranking_to "${output_dir}/rank.scifact.txt"
+
+# Convert to TREC format
+python -m tevatron.utils.format.convert_result_to_trec \
+  --input "${output_dir}/rank.scifact.txt" \
+  --output "${output_dir}/rank.scifact.trec" \
+  --remove_query
+
+# Evaluate the TREC run against beir-v1.0.0-scifact-test.
+python -m pyserini.eval.trec_eval -c -mrecall.100 -mndcg_cut.10 \
+  beir-v1.0.0-scifact-test "${output_dir}/rank.scifact.trec"
+
+# recall_100              all     0.9767
+# ndcg_cut_10             all     0.7801
+