13 changes: 4 additions & 9 deletions .github/workflows/lint_and_test.yml
@@ -28,8 +28,7 @@ jobs:
run: |
sudo apt-get install libsndfile1
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -e .
pip install -e ".[dev]"
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -58,9 +57,7 @@ jobs:
run: |
sudo apt-get install libsndfile1
python -m pip install --upgrade pip
pip install --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/pt2.5.1/cpu -r requirements.txt
pip install -r requirements-dev.txt
pip install -e .
pip install -e ".[dev]" --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/pt2.6.0/cpu
- name: mypy
run: mypy --install-types --non-interactive ./ --cache-dir=.mypy_cache/

@@ -81,8 +78,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt-get install libsndfile1
pip install -r requirements-dev.txt
pip install -e .
pip install -e ".[dev]"
- name: pytest_unit
run: pytest -s -v tests/unit_tests/

@@ -104,8 +100,7 @@ jobs:
run: |
sudo apt-get install libsndfile1
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -e .
pip install -e ".[dev]"
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
2 changes: 1 addition & 1 deletion LICENSE.md
@@ -68,5 +68,5 @@ BUT BEWARE, the following speech encoders are released under a non-commercial license:
| vie | vietnamese | https://dl.fbaipublicfiles.com/SONAR/spenc.v5ap.vie.pt |
| yue | yue | https://dl.fbaipublicfiles.com/SONAR/spenc.v5ap.yue.pt |

The SONAR text encoder and decoder, as well as the BLASER 2.0 models, are released under the same non-commercial
license ([NC_MODEL_LICENSE](NC_MODEL_LICENSE.md)).
2 changes: 1 addition & 1 deletion NC_MODEL_LICENSE.md
@@ -58,7 +58,7 @@ exhaustive, and do not form part of our licenses.
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More_considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees

=======================================================================
26 changes: 19 additions & 7 deletions README.md
@@ -2,7 +2,7 @@
[[Paper]](https://ai.meta.com/research/publications/sonar-sentence-level-multimodal-and-language-agnostic-representations/)
[[Demo]](#usage)

We introduce SONAR, a new multilingual and multimodal fixed-size sentence embedding space, with a full suite of speech and text encoders and decoders. It substantially outperforms existing sentence embeddings such as LASER3 and LaBSE on the xsim and xsim++ multilingual similarity search tasks.

Speech segments can be embedded in the same SONAR embedding space using language-specific speech encoders trained in a teacher-student setting on speech transcription data. We also provide a single text decoder, which allows us to perform text-to-text and speech-to-text machine translation, including for zero-shot language and modality combinations.

@@ -37,7 +37,7 @@ pip install fairseq2 --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/
```
If [fairseq2](https://github.com/facebookresearch/fairseq2) does not provide a build for your machine, check the readme of that project to build it locally.

We recommend installing SONAR only after you have a correct version of `fairseq2` installed. Note that SONAR currently relies on the stable version of fairseq2 0.4.5 (with minor variations possible).
We recommend installing SONAR only after you have a correct version of `fairseq2` installed. Note that SONAR currently relies on the stable version of `fairseq2>=0.5.2` (with minor variations possible).

If you want to install SONAR manually, you can install it locally:

@@ -46,6 +46,14 @@ pip install --upgrade pip
pip install -e .
```

### Versions
Unfortunately, the SONAR code is tightly coupled to the fairseq2 code, so only specific versions are compatible with each other:
- `sonar-space~=0.5.0` (the current version) requires `fairseq2>=0.5.2`
- `sonar-space~=0.4.0` required `fairseq2~=0.4.0`
- `sonar-space~=0.2.0` required `fairseq2~=0.2.0`

In the future, once the `fairseq2` interface stabilizes, we hope to make the version dependencies less tightly coupled.


## Usage
fairseq2 will automatically download models into your `$TORCH_HOME/hub` directory upon using the commands below.
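As a rough sketch of where those downloads land (assuming torch's usual default of `~/.cache/torch` when `$TORCH_HOME` is unset):

```python
import os

def torch_hub_dir() -> str:
    """Resolve the hub directory used for model downloads ($TORCH_HOME/hub)."""
    torch_home = os.environ.get(
        "TORCH_HOME", os.path.join(os.path.expanduser("~"), ".cache", "torch")
    )
    return os.path.join(torch_home, "hub")
```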
@@ -150,7 +158,7 @@ assert sr == 16000, "Sample rate should be 16kHz"
s2t_model.predict([inp], target_lang="eng_Latn")
# ['Television reports show white smoke coming from the plant.']

# passing multiple wav files
s2t_model.predict(["./tests/integration_tests/data/audio_files/audio_1.wav",
"./tests/integration_tests/data/audio_files/audio_2.wav"], target_lang="eng_Latn")
# ['Television reports show white smoke coming from the plant.',
@@ -161,8 +169,8 @@ s2t_model.predict(["./tests/integration_tests/data/audio_files/audio_1.wav",
### Predicting sentence similarity with BLASER 2.0 models

BLASER 2.0 is a family of models for automatic evaluation of machine translation quality based on SONAR embeddings.
They predict [cross-lingual semantic similarity](https://github.com/facebookresearch/fairseq/tree/nllb/examples/nllb/human_XSTS_eval)
between the translation and the source (optionally, also using a reference translation).

```Python
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
@@ -182,7 +190,7 @@ with torch.inference_mode():
```

Detailed model cards with more examples: [facebook/blaser-2.0-ref](https://huggingface.co/facebook/blaser-2.0-ref),
[facebook/blaser-2.0-qe](https://huggingface.co/facebook/blaser-2.0-qe).

### Classifying the toxicity of sentences with MuTox

@@ -237,6 +245,10 @@ See more complete demo notebooks:
* [sonar speech2text and other data pipeline examples](examples/inference_pipelines.ipynb)
* [sonar bilingual document alignment with sonar text similarity](examples/bilingual_document.ipynb)

### Troubleshooting

- In case of errors like `fairseq2.assets.card.AssetCardError: Model checkpoint of the blaser_2_0_qe asset card cannot be loaded`, try removing the fairseq2 assets cache (located in `~/.cache/fairseq2`); it might be that some of the downloaded model checkpoints are invalid.
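A minimal sketch of that cache cleanup (the helper name and the default `~/.cache/fairseq2` location are assumptions based on the note above):

```python
import shutil
from pathlib import Path
from typing import Optional

def clear_fairseq2_cache(cache_dir: Optional[Path] = None) -> bool:
    """Delete the fairseq2 asset cache so checkpoints are re-downloaded.

    Returns True if a cache directory was found and removed.
    """
    cache_dir = cache_dir or Path.home() / ".cache" / "fairseq2"
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
        return True
    return False
```

After clearing, the next model load triggers a fresh checkpoint download.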


## Supported languages and download links
The SONAR text encoder & decoder supports 200 languages. SONAR speech encoders support 37 languages.
@@ -554,6 +566,6 @@ See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out.

SONAR code is released under the MIT license (see [CODE_LICENSE](CODE_LICENSE.md)).

Some of the SONAR models are released with the same MIT license, BUT BEWARE,
some of them are released under a non commercial license (see [NC_MODEL_LICENSE](NC_MODEL_LICENSE.md)).
Please refer to [LICENSE](LICENSE.md) for the details.
62 changes: 33 additions & 29 deletions examples/inference_pipelines.ipynb
@@ -12,24 +12,6 @@
"* Speech to Text translation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Install sonar\n",
"\n",
"if sonar is not yet installed, install it:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install --quiet sonar-space"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -61,11 +43,13 @@
"metadata": {},
"outputs": [],
"source": [
"from sonar.models.sonar_speech.loader import load_sonar_speech_model\n",
"from fairseq2 import init_fairseq2\n",
"from fairseq2.data.text import get_text_tokenizer_hub\n",
"\n",
"from sonar.models.sonar_speech import get_sonar_speech_encoder_hub\n",
"from sonar.models.sonar_text import (\n",
" load_sonar_text_decoder_model,\n",
" load_sonar_text_encoder_model,\n",
" load_sonar_tokenizer,\n",
" get_sonar_text_decoder_hub,\n",
" get_sonar_text_encoder_hub,\n",
")"
]
},
@@ -75,7 +59,17 @@
"metadata": {},
"outputs": [],
"source": [
"speech_encoder_model = load_sonar_speech_model(\n",
"init_fairseq2()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"speech_encoder_hub = get_sonar_speech_encoder_hub()\n",
"speech_encoder_model = speech_encoder_hub.load(\n",
" \"sonar_speech_encoder_eng\", device=device\n",
").eval()"
]
@@ -86,7 +80,8 @@
"metadata": {},
"outputs": [],
"source": [
"text_encoder_model = load_sonar_text_encoder_model(\n",
"text_encoder_hub = get_sonar_text_encoder_hub()\n",
"text_encoder_model = text_encoder_hub.load(\n",
" \"text_sonar_basic_encoder\", device=device\n",
").eval()"
]
@@ -97,7 +92,8 @@
"metadata": {},
"outputs": [],
"source": [
"text_decoder_model = load_sonar_text_decoder_model(\n",
"text_decoder_hub = get_sonar_text_decoder_hub()\n",
"text_decoder_model = text_decoder_hub.load(\n",
" \"text_sonar_basic_decoder\", device=device\n",
").eval()"
]
@@ -109,7 +105,8 @@
"outputs": [],
"source": [
"# tokenizer is compatible with nllb tokenizer logic already\n",
"text_tokenizer = load_sonar_tokenizer(\"text_sonar_basic_encoder\")"
"text_tokenizer_hub = get_text_tokenizer_hub()\n",
"text_tokenizer = text_tokenizer_hub.load(\"text_sonar_basic_encoder\")"
]
},
{
@@ -260,7 +257,7 @@
}
],
"source": [
"data_source = \"./data/eng_flores200_dev_sample.tsv\"\n",
"data_source = \"./eng_flores200_dev_sample.tsv\"\n",
"text_emb = text_embedding_pipeline.predict(data_source, source_lang=\"eng_Latn\")\n",
"text_emb"
]
@@ -297,11 +294,18 @@
")\n",
"text_translation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "sonar_fairseq2",
"language": "python",
"name": "python3"
},
@@ -315,7 +319,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.8.17"
},
"orig_nbformat": 4
},
20 changes: 2 additions & 18 deletions examples/sonar_text_demo.ipynb
@@ -1,21 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Install the dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install --quiet sonar-space seaborn pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -219,14 +203,14 @@
" french_translated_sentences, [french_sentences], tokenize=\"flores200\"\n",
")\n",
"print(\"*\" * 120)\n",
"print(\"english to french translation bleu score :\")\n",
"print(\"english to spanish translation bleu score :\")\n",
"print(bleu_obj)\n",
"\n",
"bleu_obj = sacrebleu.corpus_bleu(\n",
" english_translated_sentences, [english_sentences], tokenize=\"flores200\"\n",
")\n",
"print(\"*\" * 120)\n",
"print(\"french to english translation bleu score :\")\n",
"print(\"spanish to english translation bleu score :\")\n",
"print(bleu_obj)"
]
},
24 changes: 12 additions & 12 deletions huggingface_pipelines/text.py
@@ -333,9 +333,9 @@ def process_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]:
):
all_embeddings = np.asarray(embeddings, dtype=self.config.dtype)
all_decoded_texts = self.decode_embeddings(all_embeddings)
batch[
f"{column}_{self.config.output_column_suffix}"
] = all_decoded_texts
batch[f"{column}_{self.config.output_column_suffix}"] = (
all_decoded_texts
)
elif all(isinstance(item, list) for item in embeddings):
all_embeddings = np.vstack(
[
@@ -351,9 +351,9 @@
all_decoded_texts[start:end]
for start, end in zip([0] + indices[:-1], indices)
]
batch[
f"{column}_{self.config.output_column_suffix}"
] = reconstructed_texts
batch[f"{column}_{self.config.output_column_suffix}"] = (
reconstructed_texts
)
else:
raise ValueError(f"Invalid input type for column {column}")
logger.debug(
@@ -490,9 +490,9 @@ def process_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]:
# Case: List of individual strings
all_texts = batch[column]
all_embeddings = self.encode_texts(all_texts)
batch[
f"{column}_{self.config.output_column_suffix}"
] = all_embeddings
batch[f"{column}_{self.config.output_column_suffix}"] = (
all_embeddings
)
elif all(isinstance(item, list) for item in batch[column]):
# Case: List of lists (sentences)
all_sentences = [
Expand All @@ -513,9 +513,9 @@ def process_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]:
for start, end in zip([0] + indices[:-1], indices)
]

batch[
f"{column}_{self.config.output_column_suffix}"
] = sentence_embeddings
batch[f"{column}_{self.config.output_column_suffix}"] = (
sentence_embeddings
)

else:
raise ValueError(
Expand Down
17 changes: 11 additions & 6 deletions pyproject.toml
@@ -24,7 +24,10 @@ classifiers=[
dependencies = [
# fairseq2 is installed with the cpu deps; if you want a gpu build, you need to install it manually.
# see https://github.com/facebookresearch/fairseq2
"fairseq2~=0.4.0",
# we use a relaxed fairseq2 version constraint so that users can stay flexible
# (depending on the other dependencies of their project)
"fairseq2>=0.5.2",
# "mpmath==1.3.0", # I am not sure if we need it
"numpy>=1.21",
"torch",
"torchaudio",
@@ -42,17 +45,19 @@ dependencies = [
"pytest-cov>=2.6.1",
"coverage[toml]>=5.1",
# Format
"black==22.3.0",
"black==25.1.0",
"isort>=5.10.1",
# Linters
"mypy>=0.782",
"pylint>=2.8.0",
"flake8",
"types-tqdm"
]
cpu = [
"torch==2.5.1+cpu",
"torchaudio==2.5.1+cpu",
"fairseq2n~=0.4.0",
"fairseq2~=0.4.0",
"torch==2.6.0+cpu",
"torchaudio==2.6.0+cpu",
"fairseq2n>=0.5.2",
"fairseq2>=0.5.2",
]
hg = [
"transformers>=4.44.0",