Thanks to the KAIST-AILab for providing resources. Models are trained on the Vox, LRS2, and LRS3 datasets.
@inproceedings{ahn2024syncvsr,
  author    = {Ahn, Young Jin and Park, Jungwoo and Park, Sangha and Choi, Jonghyun and Kim, Kee-Eung},
  title     = {{SyncVSR}: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization},
  booktitle = {Proc. Interspeech 2024},
  year      = {2024},
  doi       = {10.21437/Interspeech.2024-432},
}