Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,29 @@ LEMATERIALFETCHER_DEST_TABLE_NAME=optimade_materials
# Transformer processing settings
# LEMATERIALFETCHER_BATCH_SIZE=500
# LEMATERIALFETCHER_OFFSET=0
# LEMATERIALFETCHER_LOG_EVERY=1000
# LEMATERIALFETCHER_LOG_EVERY=1000

# ------------------------------------------------------------------------------
# LeMatRho Configuration (charge density pipeline)
# ------------------------------------------------------------------------------

# AWS credentials for authenticated S3 access (LeMatRho bucket)
# AWS_ACCESS_KEY_ID=your_access_key
# AWS_SECRET_ACCESS_KEY=your_secret_key
# AWS_DEFAULT_REGION=us-east-1

# LeMatRho S3 bucket name
# LEMATERIALFETCHER_LEMATRHO_BUCKET_NAME=lemat-rho

# VASP pseudopotential directory (required for Bader/DDEC6 analysis)
# PMG_VASP_PSP_DIR=/path/to/vasp/pseudopotentials

# External tool paths (optional, auto-detected on PATH if not set)
# LEMATERIALFETCHER_BADER_PATH=/path/to/bader
# LEMATERIALFETCHER_CHARGEMOL_PATH=/path/to/chargemol
# LEMATERIALFETCHER_CHGSUM_SCRIPT_PATH=/path/to/chgsum.pl
# LEMATERIALFETCHER_ATOMIC_DENSITIES_PATH=/path/to/atomic_densities

# HuggingFace (for pushing dataset after pipeline completes)
# HF_REPO_ID=your-org/lematrho-dataset
# HF_TOKEN=hf_your_token
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ celerybeat.pid

# Environments
.env
.env.*
!.env.example
.venv
# env/
venv/
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"moyopy>=0.4.2",
"ase>=3.24.0",
"material-hasher",
"mp-pyrho>=0.3.1",
]

[project.scripts]
Expand Down Expand Up @@ -58,5 +59,10 @@ dev-dependencies = [
material-hasher = { git = "https://github.com/LeMaterial/lematerial-hasher.git" }


[tool.pytest.ini_options]
markers = [
"integration: tests that require real AWS credentials and S3 access (deselect with '-m \"not integration\"')",
]

[tool.ruff.lint]
extend-select = ["I"]
94 changes: 94 additions & 0 deletions src/lematerial_fetcher/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
MPTrajectoryTransformer,
MPTransformer,
)
from lematerial_fetcher.fetcher.lematrho.fetch import LeMatRhoFetcher
from lematerial_fetcher.fetcher.lematrho.pipeline import LeMatRhoDirectPipeline
from lematerial_fetcher.fetcher.lematrho.transform import LeMatRhoTransformer
from lematerial_fetcher.fetcher.oqmd.fetch import OQMDFetcher
from lematerial_fetcher.fetcher.oqmd.transform import (
OQMDTrajectoryTransformer,
Expand All @@ -37,13 +40,17 @@
from lematerial_fetcher.utils.cli import (
add_common_options,
add_fetch_options,
add_lematrho_direct_options,
add_lematrho_fetch_options,
add_lematrho_transform_options,
add_mp_fetch_options,
add_mysql_options,
add_push_options,
add_transformer_options,
get_default_mp_bucket_name,
)
from lematerial_fetcher.utils.config import (
load_direct_pipeline_config,
load_fetcher_config,
load_push_config,
load_transformer_config,
Expand Down Expand Up @@ -114,9 +121,17 @@ def oqmd_cli(ctx):
pass


@click.group(name="lematrho")
@click.pass_context
def lematrho_cli(ctx):
    """Commands for fetching charge density data from LeMatRho."""
    # Pure grouping node: the docstring above is the help text Click prints for
    # this group, and the subcommands attached to it carry all the behaviour.


cli.add_command(mp_cli)
cli.add_command(alexandria_cli)
cli.add_command(oqmd_cli)
cli.add_command(lematrho_cli)

# ------------------------------------------------------------------------------
# MP commands
Expand Down Expand Up @@ -341,6 +356,85 @@ def oqmd_transform(ctx, traj, **config_kwargs):
logger.fatal("\nAborted.", exit=1)


# ------------------------------------------------------------------------------
# LeMatRho commands
# ------------------------------------------------------------------------------


@lematrho_cli.command(name="fetch")
@click.pass_context
@add_common_options
@add_fetch_options
@add_lematrho_fetch_options
def lematrho_fetch(ctx, **config_kwargs):
    """Fetch charge density data from the LeMatRho S3 bucket.

    Downloads CHGCAR/AECCAR files, compresses charge densities via pyrho,
    and stores compressed grids in the raw_structures database table.

    Requires AWS credentials (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY).
    """
    # The docstring doubles as the Click help text for this command.
    # LeMatRho does not use these settings, so placeholder values are supplied
    # before the options are handed to the fetcher config loader.
    config_kwargs.update(
        base_url="DUMMY_BASE_URL",
        mp_bucket_name="",
        mp_bucket_prefix="",
    )

    # Re-key the Click option: the value is passed on untouched under the
    # name "lematrho_grid_shape".
    if "grid_shape" in config_kwargs:
        grid_shape = config_kwargs.pop("grid_shape")
        config_kwargs["lematrho_grid_shape"] = grid_shape

    fetcher_config = load_fetcher_config(**config_kwargs)
    try:
        LeMatRhoFetcher(config=fetcher_config, debug=ctx.obj["debug"]).fetch()
    except KeyboardInterrupt:
        # Ctrl-C: report the abort through the logger instead of a traceback.
        logger.fatal("\nAborted.", exit=1)


@lematrho_cli.command(name="transform")
@click.pass_context
@add_common_options
@add_transformer_options
@add_lematrho_transform_options
def lematrho_transform(ctx, **config_kwargs):
    """Transform raw LeMatRho structures into OPTIMADE format.

    Optionally runs Bader and DDEC6 charge analysis using external tools.

    External tool requirements:
    - bader executable (--bader-path or on PATH)
    - perl + chgsum.pl script (--chgsum-script-path)
    - chargemol executable (--chargemol-path or on PATH)
    - PMG_VASP_PSP_DIR environment variable for POTCAR generation
    - Atomic densities directory (--atomic-densities-path) for DDEC6
    """
    # The docstring doubles as the Click help text for this command.
    # All CLI options are forwarded unchanged to the transformer config loader.
    transformer_config = load_transformer_config(**config_kwargs)
    try:
        debug_enabled = ctx.obj["debug"]
        job = LeMatRhoTransformer(config=transformer_config, debug=debug_enabled)
        job.transform()
    except KeyboardInterrupt:
        # Ctrl-C: report the abort through the logger instead of a traceback.
        logger.fatal("\nAborted.", exit=1)


@lematrho_cli.command(name="run")
@click.pass_context
@add_lematrho_direct_options
def lematrho_run(ctx, **config_kwargs):
    """Run complete LeMatRho pipeline: S3 -> Parquet -> HuggingFace.

    Downloads charge density data, compresses via pyrho, optionally runs
    Bader and DDEC6 analysis, and writes Parquet files directly.
    No PostgreSQL required.

    Requires AWS credentials (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY).
    """
    # The docstring doubles as the Click help text for this command.
    # All CLI options are forwarded unchanged to the direct-pipeline config
    # loader.
    pipeline_config = load_direct_pipeline_config(**config_kwargs)
    try:
        debug_enabled = ctx.obj["debug"]
        LeMatRhoDirectPipeline(config=pipeline_config, debug=debug_enabled).run()
    except KeyboardInterrupt:
        # Ctrl-C: report the abort through the logger instead of a traceback.
        logger.fatal("\nAborted.", exit=1)


# ------------------------------------------------------------------------------
# Push commands
# ------------------------------------------------------------------------------
Expand Down
40 changes: 40 additions & 0 deletions src/lematerial_fetcher/database/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,14 @@ def columns(cls) -> dict[str, str]:
"space_group_it_number": "INTEGER",
"cross_compatibility": "BOOLEAN",
"bawl_fingerprint": "TEXT",
"compressed_charge_density": "JSONB",
"compressed_aeccar0": "JSONB",
"compressed_aeccar1": "JSONB",
"compressed_aeccar2": "JSONB",
"charge_density_grid_shape": "INTEGER[]",
"bader_charges": "FLOAT[]",
"bader_atomic_volume": "FLOAT[]",
"ddec6_charges": "FLOAT[]",
}

def _prepare_species_data(self, species: list[dict[str, Any]]) -> list[Json]:
Expand Down Expand Up @@ -572,6 +580,14 @@ def insert_data(self, structure: OptimadeStructure) -> None:
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
)
cur.execute(query, input_data)
self.conn.commit()
Expand Down Expand Up @@ -639,6 +655,14 @@ def batch_insert_data(
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
)
)

Expand Down Expand Up @@ -740,6 +764,14 @@ def insert_data(self, structure: Trajectory) -> None:
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
# trajectory-specific fields
structure.relaxation_step,
structure.relaxation_number,
Expand Down Expand Up @@ -810,6 +842,14 @@ def batch_insert_data(
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
# trajectory-specific fields
structure.relaxation_step,
structure.relaxation_number,
Expand Down
2 changes: 1 addition & 1 deletion src/lematerial_fetcher/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def db(self) -> StructuresDatabase:

Returns
-------
StructuresDatabase
StructuresDatabase
Database connection
"""
if self._db is None:
Expand Down
1 change: 1 addition & 0 deletions src/lematerial_fetcher/fetcher/lematrho/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Copyright 2025 Entalpic
Loading
Loading