# Create and activate a dedicated conda environment (Python 3.9) for AttnCache.
conda create -n AttnCache python=3.9
conda activate AttnCache
# Core deep-learning stack (pip selects the default PyTorch wheel for this platform).
pip install torch torchvision torchaudio
# Pinned transformers release plus dataset/metrics/plotting utilities and
# faiss-cpu — NOTE(review): the CPU FAISS build suggests similarity search over
# cached attention states runs on CPU; confirm against the pipeline scripts.
pip install transformers==4.50.3 accelerate datasets scikit-learn scipy matplotlib faiss-cpu
# Quantization / inference-acceleration backends.
pip install auto-gptq optimum bitsandbytes
# Download and unpack the evaluation data (presumably the MMLU benchmark,
# given the Hendrycks hosting URL — TODO confirm).
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar -xf data.tar
# Collect Hidden States and Attention Maps
# Step 1: collect hidden states and attention maps from the target model
# (Llama-3.2-3B-Instruct pulled from the Hugging Face hub).
python collect_hs_apms_llama.py --model-path meta-llama/Llama-3.2-3B-Instruct
# Step 2: train the "fp" model (presumably a feature predictor/projector —
# confirm against the script) and build the attention-cache database.
# --epoch / --batchsize are its training hyperparameters.
python train_fp_and_build_db.py --epoch 3 --batchsize 32
# Step 3: run evaluation with AttnCache enabled. NOTE(review): --threshold
# looks like the similarity cutoff for reusing a cached attention map —
# verify the exact semantics in test_llama.py.
python test_llama.py --threshold 0.995
# If you find AttnCache useful or relevant to your project or research, please kindly cite our paper:
# @article{song2025attncache,
#   title={AttnCache: Accelerating Self-Attention Inference for LLM Prefill via Attention Cache},
#   author={Song, Dinghong and Feng, Yuan and Wang, Yiwei and Chen, Shangye and Guyot, Cyril and Blagojevic, Filip and Jeon, Hyeran and Su, Pengfei and Li, Dong},
#   journal={arXiv},
#   year={2025}
# }
