Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dev = [
"ipython>=8.29.0",
"pre-commit>=4.0.1",
"ruff>=0.8.0",
"pytest>=8.3",
"shibuya>=2024.10.15",
"sphinx-autoapi>=3.3.2",
"sphinx-autodoc-typehints>=2.5.0",
Expand All @@ -38,5 +39,6 @@ dev = [
"sphinx-design>=0.6.1",
"sphinx-math-dollar>=1.2.1",
"sphinxawesome-theme>=5.3.2",
"ipdb>=0.13.13",
]

2 changes: 1 addition & 1 deletion src/material_hasher/benchmark/disordered.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2025 Entalpic
import logging
from itertools import combinations
from typing import Dict, List, Tuple
import logging

import numpy as np
import pandas as pd
Expand Down
6 changes: 3 additions & 3 deletions src/material_hasher/benchmark/run_disordered.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
# Copyright 2025 Entalpic
import datetime
import logging
import os
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
import logging

import numpy as np
import pandas as pd
import tqdm
import yaml
from pymatgen.core import Structure

logger = logging.getLogger(__name__)

from material_hasher.benchmark.disordered import (
download_disordered_structures,
get_classification_results_dissimilar,
Expand All @@ -25,6 +23,8 @@
from material_hasher.similarity import SIMILARITY_MATCHERS
from material_hasher.types import StructureEquivalenceChecker

logger = logging.getLogger(__name__)

STRUCTURE_CHECKERS = {**HASHERS, **SIMILARITY_MATCHERS}


Expand Down
52 changes: 29 additions & 23 deletions src/material_hasher/benchmark/run_transformations.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# Copyright 2025 Entalpic
import datetime
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from datasets import Dataset, VerificationMode, concatenate_datasets, load_dataset
from datasets import Dataset, VerificationMode, load_dataset
from pymatgen.core import Structure

from material_hasher.benchmark.transformations import ALL_TEST_CASES, get_test_case
Expand All @@ -27,7 +27,9 @@
STRUCTURE_CHECKERS = {**HASHERS, **SIMILARITY_MATCHERS}


def get_hugging_face_dataset(
    token: Optional[str] = None, n_rows: Optional[int] = None
) -> Dataset:
    """
    Load the ``compatible_pbe`` subset of the LeMat-Bulk dataset from Hugging Face.

    Parameters
    ----------
    token : str, optional
        The authentication token required to access the dataset.
        Optional if the dataset is public or you have already configured the Hugging Face CLI.
    n_rows : int, optional
        Number of rows to load from the dataset. When given, only the first
        ``n_rows`` rows of the ``train`` split are downloaded (sequential order),
        using the ``datasets`` split-slicing syntax ``train[:n_rows]``.

    Returns
    -------
    Dataset
        The ``compatible_pbe`` subset of LeMat-Bulk (or its first ``n_rows`` rows).
    """
    # Use split slicing so a small benchmark run does not download the full subset.
    split = "train"
    if n_rows is not None:
        split += f"[:{n_rows}]"

    return load_dataset(
        "LeMaterial/LeMat-Bulk",
        "compatible_pbe",
        split=split,
        token=token,
        # Skip checksum verification to speed up repeated benchmark loads.
        verification_mode=VerificationMode.NO_CHECKS,
    )


def get_data_from_hugging_face(
token: Optional[str] = None, n_test_elements: int = 100, seed: int = 0
token: Optional[str] = None,
n_test_elements: int = 100,
n_rows: Optional[int] = None,
seed: int = 0,
) -> list[Structure]:
"""
Downloads and processes structural data from the Hugging Face `datasets` library.
Expand All @@ -80,6 +80,8 @@ def get_data_from_hugging_face(
n_test_elements : int
Number of elements to select from the dataset to run the benchmark on. Default is 100.
This is used to run the transformation benchmark only a subset of LeMat-Bulk.
n_rows : int, optional
Number of rows to load from the dataset. This will load them in sequential order.
seed : int
Random seed for selecting a subset of the dataset. Default is 0.

Expand All @@ -101,7 +103,7 @@ def get_data_from_hugging_face(
- Errors during the transformation process are logged but do not halt execution.
"""

ds = get_hugging_face_dataset(token)
ds = get_hugging_face_dataset(token, n_rows=n_rows)

# Convert dataset to Pandas DataFrame
logger.info("Loaded dataset:", len(ds))
Expand Down Expand Up @@ -232,7 +234,11 @@ def hasher_sensitivity(
else:
raise ValueError("Unknown structure checker")

return matching_hashes / len(transformed_structures) if len(transformed_structures) > 0 else 0
return (
matching_hashes / len(transformed_structures)
if len(transformed_structures) > 0
else 0
)


def mean_sensitivity(
Expand Down
13 changes: 6 additions & 7 deletions src/material_hasher/benchmark/transformations.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# Copyright 2025 Entalpic
import inspect
import random
from typing import Optional, Union
from typing import Optional

import numpy as np
from pymatgen.core import Structure, SymmOp
from pymatgen.core import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

ALL_TEST_CASES = [
Expand All @@ -16,10 +15,10 @@
]

# Per-transformation parameter grids swept by the benchmark.
# ``sigma`` grids are geometric sequences (note the tiny ``base``: values are
# base ** exponent, so these span very small perturbation magnitudes).
PARAMETERS = {
    "gaussian_noise": {"sigma": np.logspace(0.0001, 0.5, 15, base=0.0000001)},
    "isometric_strain": {"pct": [1, 1.05, 1.1, 1.2, 1.5]},
    "strain": {"sigma": np.logspace(0.001, 0.5, 10, base=0.0000001)},
    "translation": {"sigma": np.logspace(0.0001, 0.5, 15, base=0.0000001)},
    "symm_ops": {"structure_symmetries": ["all_symmetries_found"]},
}

Expand Down
17 changes: 12 additions & 5 deletions src/material_hasher/hasher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Copyright 2025 Entalpic
from material_hasher.hasher.entalpic import EntalpicMaterialsHasher, ShortenedEntalpicMaterialsHasher
from material_hasher.hasher.example import SimpleCompositionHasher
import warnings

from material_hasher.hasher.entalpic import (
EntalpicMaterialsHasher,
ShortenedEntalpicMaterialsHasher,
)
from material_hasher.hasher.pdd import PointwiseDistanceDistributionHasher

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings("always")

__all__ = ["EntalpicMaterialsHasher"]

Expand All @@ -17,6 +20,10 @@

try:
    from material_hasher.hasher.slices import SLICESHasher

    HASHERS.update({"SLICES": SLICESHasher})
except ImportError:
    # BUG FIX: ``warnings.info`` does not exist — the except path would raise
    # AttributeError instead of emitting a warning. Use ``warnings.warn``.
    warnings.warn(
        "Failed to import SLICES. If you would like to use this module, please consider running uv pip install -r requirements_slices.txt",
        ImportWarning,
    )
1 change: 1 addition & 0 deletions src/material_hasher/hasher/entalpic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class EntalpicMaterialsHasher(HasherBase):
Returns hash based on bonding graph structure, composition,
and symmetry.
"""

def __init__(
self,
graphing_algorithm: str = "WL",
Expand Down
4 changes: 1 addition & 3 deletions src/material_hasher/hasher/pdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ def get_material_hash(self, structure: Structure) -> str:
"""
periodic_set = self.periodicset_from_structure(structure)

pdd = PDD(
periodic_set, int(self.cutoff), collapse=False
)
pdd = PDD(periodic_set, int(self.cutoff), collapse=False)

# Round the PDD values to 4 decimal places for numerical stability and consistency.
pdd = np.round(pdd, decimals=4)
Expand Down
4 changes: 3 additions & 1 deletion src/material_hasher/hasher/slices.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
# uv pip install -r requirements_slices.txt


import tensorflow as tf
from pymatgen.core.structure import Structure
from slices.core import SLICES

from material_hasher.hasher.base import HasherBase

tf.get_logger().setLevel("ERROR")


class SLICESHasher(HasherBase):
def __init__(self):
Expand All @@ -32,4 +35,3 @@ def get_material_hash(self, structure: Structure) -> str:
The SLICES string representation of the structure.
"""
return self.backend.structure2SLICES(structure)

9 changes: 3 additions & 6 deletions src/material_hasher/hasher/utils/graph_structure.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# Copyright 2025 Entalpic
from pymatgen.analysis.graphs import StructureGraph
from networkx import Graph
from pymatgen.analysis.local_env import EconNN, NearNeighbors
from pymatgen.core import Structure
from networkx import Graph


def get_structure_graph(
Expand All @@ -23,10 +22,8 @@ class to build bonded structure. Defaults to EconNN.
Returns:
Graph: networkx Graph object
"""
structure_graph = StructureGraph.with_local_env_strategy(
structure=structure,
strategy=bonding_algorithm(**bonding_kwargs),
)
bonding = bonding_algorithm(**bonding_kwargs)
structure_graph = bonding.get_bonded_structure(structure)
for n, site in zip(range(len(structure)), structure):
structure_graph.graph.nodes[n]["specie"] = site.specie.name
for edge in structure_graph.graph.edges:
Expand Down
12 changes: 11 additions & 1 deletion src/material_hasher/similarity/structure_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,18 @@ def is_equivalent(
First structure to compare.
structure2 : Structure
Second structure to compare.
threshold : Optional[float]
Optional threshold to override the default tolerance.

Returns
-------
bool
True if the two structures are similar, False otherwise.
"""
if threshold is not None:
# Create a temporary matcher with the new threshold
temp_matcher = StructureMatcher(ltol=threshold)
return temp_matcher.fit(structure1, structure2)
return self.matcher.fit(structure1, structure2)

def get_similarity_score(
Expand All @@ -60,7 +66,11 @@ def get_similarity_score(
float
Similarity score between the two structures.
"""
return self.matcher.get_rms_dist(structure1, structure2)
# RMS displacement is normalized by (Vol / nsites) ** (1/3) in PMG
distance = self.matcher.get_rms_dist(structure1, structure2)
if distance is None: # No alignment found
return 0.0
return 1.0 - distance[0]

def get_pairwise_equivalence(
self, structures: list[Structure], threshold: Optional[float] = None
Expand Down
51 changes: 51 additions & 0 deletions tests/benchmark/test_dataset_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest
from datasets import Dataset
from material_hasher.benchmark.disordered import download_disordered_structures
from material_hasher.benchmark.run_transformations import get_data_from_hugging_face


@pytest.fixture
def small_test_dataset():
    """Provide a minimal single-structure dataset (one Si atom in a unit cube)."""
    rows = {
        "lattice_vectors": [[[1, 0, 0], [0, 1, 0], [0, 0, 1]]],
        "species_at_sites": [["Si"]],
        "cartesian_site_positions": [[[0, 0, 0]]],
    }
    return Dataset.from_dict(rows)


def test_hugging_face_data_loading(small_test_dataset, monkeypatch):
    """Data loading should build one pymatgen structure from the mocked dataset."""

    def fake_load_dataset(*_args, **_kwargs):
        return small_test_dataset

    # Swap the real Hugging Face download for the synthetic fixture.
    monkeypatch.setattr(
        "material_hasher.benchmark.run_transformations.load_dataset",
        fake_load_dataset,
    )

    structures = get_data_from_hugging_face(n_test_elements=1)
    assert len(structures) == 1
    assert structures[0].formula == "Si1"


@pytest.mark.integration
def test_download_transformations_dataset():
    """Integration: fetch a tiny slice of the transformations dataset from HF."""
    # Request only two rows/elements so the download stays small and fast.
    structures = get_data_from_hugging_face(n_test_elements=2, n_rows=2)
    assert len(structures) == 2


@pytest.mark.integration
def test_download_disordered_structures():
    """Integration: the disordered-structures HF dataset downloads non-empty."""
    downloaded = download_disordered_structures()
    assert len(downloaded) > 0
Loading