Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dev = [
"ipython>=8.29.0",
"pre-commit>=4.0.1",
"ruff>=0.8.0",
"pytest>=8.3",
"shibuya>=2024.10.15",
"sphinx-autoapi>=3.3.2",
"sphinx-autodoc-typehints>=2.5.0",
Expand All @@ -38,5 +39,6 @@ dev = [
"sphinx-design>=0.6.1",
"sphinx-math-dollar>=1.2.1",
"sphinxawesome-theme>=5.3.2",
"ipdb>=0.13.13",
]

2 changes: 1 addition & 1 deletion src/material_hasher/benchmark/disordered.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2025 Entalpic
import logging
from itertools import combinations
from typing import Dict, List, Tuple
import logging

import numpy as np
import pandas as pd
Expand Down
6 changes: 3 additions & 3 deletions src/material_hasher/benchmark/run_disordered.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
# Copyright 2025 Entalpic
import datetime
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
import logging

import numpy as np
import pandas as pd
import tqdm
import yaml
from pymatgen.core import Structure

logger = logging.getLogger(__name__)

from material_hasher.benchmark.disordered import (
download_disordered_structures,
get_classification_results_dissimilar,
Expand All @@ -25,6 +23,8 @@
from material_hasher.similarity import SIMILARITY_MATCHERS
from material_hasher.types import StructureEquivalenceChecker

logger = logging.getLogger(__name__)

STRUCTURE_CHECKERS = {**HASHERS, **SIMILARITY_MATCHERS}


Expand Down
52 changes: 29 additions & 23 deletions src/material_hasher/benchmark/run_transformations.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# Copyright 2025 Entalpic
import datetime
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from datasets import Dataset, VerificationMode, concatenate_datasets, load_dataset
from datasets import Dataset, VerificationMode, load_dataset
from pymatgen.core import Structure

from material_hasher.benchmark.transformations import ALL_TEST_CASES, get_test_case
Expand All @@ -27,7 +27,9 @@
STRUCTURE_CHECKERS = {**HASHERS, **SIMILARITY_MATCHERS}


def get_hugging_face_dataset(
    token: Optional[str] = None, n_rows: Optional[int] = None
) -> Dataset:
    """
    Load the ``compatible_pbe`` subset of the LeMat-Bulk dataset from Hugging Face.

    Parameters
    ----------
    token : str, optional
        The authentication token required to access the dataset.
        Optional if the dataset is public or you have already configured the Hugging Face CLI.
    n_rows : int, optional
        Number of rows to load from the dataset. When given, only the first
        ``n_rows`` rows of the ``train`` split are downloaded (sequential order),
        using the ``datasets`` split-slicing syntax ``train[:n_rows]``.

    Returns
    -------
    Dataset
        The ``compatible_pbe`` subset of LeMat-Bulk (or its first ``n_rows`` rows).
    """
    # Use split slicing so a small benchmark run does not download the full subset.
    split = "train"
    if n_rows is not None:
        split += f"[:{n_rows}]"

    return load_dataset(
        "LeMaterial/LeMat-Bulk",
        "compatible_pbe",
        split=split,
        token=token,
        # Skip checksum verification to speed up repeated benchmark loads.
        verification_mode=VerificationMode.NO_CHECKS,
    )


def get_data_from_hugging_face(
token: Optional[str] = None, n_test_elements: int = 100, seed: int = 0
token: Optional[str] = None,
n_test_elements: int = 100,
n_rows: Optional[int] = None,
seed: int = 0,
) -> list[Structure]:
"""
Downloads and processes structural data from the Hugging Face `datasets` library.
Expand All @@ -80,6 +80,8 @@ def get_data_from_hugging_face(
n_test_elements : int
Number of elements to select from the dataset to run the benchmark on. Default is 100.
This is used to run the transformation benchmark only a subset of LeMat-Bulk.
n_rows : int, optional
Number of rows to load from the dataset. This will load them in sequential order.
seed : int
Random seed for selecting a subset of the dataset. Default is 0.

Expand All @@ -101,7 +103,7 @@ def get_data_from_hugging_face(
- Errors during the transformation process are logged but do not halt execution.
"""

ds = get_hugging_face_dataset(token)
ds = get_hugging_face_dataset(token, n_rows=n_rows)

# Convert dataset to Pandas DataFrame
logger.info("Loaded dataset:", len(ds))
Expand Down Expand Up @@ -232,7 +234,11 @@ def hasher_sensitivity(
else:
raise ValueError("Unknown structure checker")

return matching_hashes / len(transformed_structures) if len(transformed_structures) > 0 else 0
return (
matching_hashes / len(transformed_structures)
if len(transformed_structures) > 0
else 0
)


def mean_sensitivity(
Expand Down
13 changes: 6 additions & 7 deletions src/material_hasher/benchmark/transformations.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# Copyright 2025 Entalpic
import inspect
import random
from typing import Optional, Union
from typing import Optional

import numpy as np
from pymatgen.core import Structure, SymmOp
from pymatgen.core import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

ALL_TEST_CASES = [
Expand All @@ -16,10 +15,10 @@
]

# Per-transformation parameter grids swept by the benchmark.
# ``sigma`` grids are geometric sequences (note the tiny ``base``: values are
# base ** exponent, so these span very small perturbation magnitudes).
PARAMETERS = {
    "gaussian_noise": {"sigma": np.logspace(0.0001, 0.5, 15, base=0.0000001)},
    "isometric_strain": {"pct": [1, 1.05, 1.1, 1.2, 1.5]},
    "strain": {"sigma": np.logspace(0.001, 0.5, 10, base=0.0000001)},
    "translation": {"sigma": np.logspace(0.0001, 0.5, 15, base=0.0000001)},
    "symm_ops": {"structure_symmetries": ["all_symmetries_found"]},
}

Expand Down
17 changes: 12 additions & 5 deletions src/material_hasher/hasher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Copyright 2025 Entalpic
from material_hasher.hasher.entalpic import EntalpicMaterialsHasher, ShortenedEntalpicMaterialsHasher
from material_hasher.hasher.example import SimpleCompositionHasher
import warnings

from material_hasher.hasher.entalpic import (
EntalpicMaterialsHasher,
ShortenedEntalpicMaterialsHasher,
)
from material_hasher.hasher.pdd import PointwiseDistanceDistributionHasher

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings("always")

__all__ = ["EntalpicMaterialsHasher"]

Expand All @@ -17,6 +20,10 @@

try:
    from material_hasher.hasher.slices import SLICESHasher

    HASHERS.update({"SLICES": SLICESHasher})
except ImportError:
    # BUG FIX: ``warnings.info`` does not exist — the except path would raise
    # AttributeError instead of emitting a warning. Use ``warnings.warn``.
    warnings.warn(
        "Failed to import SLICES. If you would like to use this module, please consider running uv pip install -r requirements_slices.txt",
        ImportWarning,
    )
1 change: 1 addition & 0 deletions src/material_hasher/hasher/entalpic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class EntalpicMaterialsHasher(HasherBase):
Returns hash based on bonding graph structure, composition,
and symmetry.
"""

def __init__(
self,
graphing_algorithm: str = "WL",
Expand Down
4 changes: 1 addition & 3 deletions src/material_hasher/hasher/pdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ def get_material_hash(self, structure: Structure) -> str:
"""
periodic_set = self.periodicset_from_structure(structure)

pdd = PDD(
periodic_set, int(self.cutoff), collapse=False
)
pdd = PDD(periodic_set, int(self.cutoff), collapse=False)

# Round the PDD values to 4 decimal places for numerical stability and consistency.
pdd = np.round(pdd, decimals=4)
Expand Down
4 changes: 3 additions & 1 deletion src/material_hasher/hasher/slices.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
# uv pip install -r requirements_slices.txt


import tensorflow as tf
from pymatgen.core.structure import Structure
from slices.core import SLICES

from material_hasher.hasher.base import HasherBase

tf.get_logger().setLevel("ERROR")


class SLICESHasher(HasherBase):
def __init__(self):
Expand All @@ -32,4 +35,3 @@ def get_material_hash(self, structure: Structure) -> str:
The SLICES string representation of the structure.
"""
return self.backend.structure2SLICES(structure)

9 changes: 3 additions & 6 deletions src/material_hasher/hasher/utils/graph_structure.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# Copyright 2025 Entalpic
from pymatgen.analysis.graphs import StructureGraph
from networkx import Graph
from pymatgen.analysis.local_env import EconNN, NearNeighbors
from pymatgen.core import Structure
from networkx import Graph


def get_structure_graph(
Expand All @@ -23,10 +22,8 @@ class to build bonded structure. Defaults to EconNN.
Returns:
Graph: networkx Graph object
"""
structure_graph = StructureGraph.with_local_env_strategy(
structure=structure,
strategy=bonding_algorithm(**bonding_kwargs),
)
bonding = bonding_algorithm(**bonding_kwargs)
structure_graph = bonding.get_bonded_structure(structure)
for n, site in zip(range(len(structure)), structure):
structure_graph.graph.nodes[n]["specie"] = site.specie.name
for edge in structure_graph.graph.edges:
Expand Down
12 changes: 11 additions & 1 deletion src/material_hasher/similarity/structure_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@ def is_equivalent(
First structure to compare.
structure2 : Structure
Second structure to compare.
threshold : Optional[float]
Optional threshold to override the default tolerance.

Returns
-------
bool
True if the two structures are similar, False otherwise.
"""
if threshold is not None:
# Create a temporary matcher with the new threshold
temp_matcher = StructureMatcher(ltol=threshold)
return temp_matcher.fit(structure1, structure2)
return self.matcher.fit(structure1, structure2)

def get_similarity_score(
Expand All @@ -60,7 +66,11 @@ def get_similarity_score(
float
Similarity score between the two structures.
"""
return self.matcher.get_rms_dist(structure1, structure2)
# RMS displacement is normalized by (Vol / nsites) ** (1/3) in PMG
distance = self.matcher.get_rms_dist(structure1, structure2)
if distance is None: # No alignment found
return 0.0
return 1.0 - distance[0]

def get_pairwise_equivalence(
self, structures: list[Structure], threshold: Optional[float] = None
Expand Down
51 changes: 51 additions & 0 deletions tests/benchmark/test_dataset_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest
from datasets import Dataset
from material_hasher.benchmark.disordered import download_disordered_structures
from material_hasher.benchmark.run_transformations import get_data_from_hugging_face


@pytest.fixture
def small_test_dataset():
    """Provide a minimal single-structure dataset (one Si atom in a unit cube)."""
    rows = {
        "lattice_vectors": [[[1, 0, 0], [0, 1, 0], [0, 0, 1]]],
        "species_at_sites": [["Si"]],
        "cartesian_site_positions": [[[0, 0, 0]]],
    }
    return Dataset.from_dict(rows)


def test_hugging_face_data_loading(small_test_dataset, monkeypatch):
    """Data loading should build one pymatgen structure from the mocked dataset."""

    def fake_load_dataset(*_args, **_kwargs):
        return small_test_dataset

    # Swap the real Hugging Face download for the synthetic fixture.
    monkeypatch.setattr(
        "material_hasher.benchmark.run_transformations.load_dataset",
        fake_load_dataset,
    )

    structures = get_data_from_hugging_face(n_test_elements=1)
    assert len(structures) == 1
    assert structures[0].formula == "Si1"


@pytest.mark.integration
def test_download_transformations_dataset():
    """Integration: fetch a tiny slice of the transformations dataset from HF."""
    # Request only two rows/elements so the download stays small and fast.
    structures = get_data_from_hugging_face(n_test_elements=2, n_rows=2)
    assert len(structures) == 2


@pytest.mark.integration
def test_download_disordered_structures():
    """Integration: the disordered-structures HF dataset downloads non-empty."""
    downloaded = download_disordered_structures()
    assert len(downloaded) > 0
Loading