From a4cdf9a7de502d62c12f1b1e57a7122e84df5a76 Mon Sep 17 00:00:00 2001 From: Arn Date: Sat, 7 Feb 2026 21:47:06 +0000 Subject: [PATCH 01/12] ethanol-water density curve --- .../analyse_ethanol_water_density.py | 238 ++++++++++++++++++ .../liquids/ethanol_water_density/analysis.py | 103 ++++++++ .../liquids/ethanol_water_density/io_tools.py | 99 ++++++++ .../liquids/ethanol_water_density/metrics.yml | 23 ++ .../app_ethanol_water_density.py | 82 ++++++ ml_peg/app/liquids/liquids.yml | 2 + .../calc_ethanol_water_density.py | 112 +++++++++ .../ethanol_water_density/compositions.py | 34 +++ .../ethanol_water_density/fake_data.py | 191 ++++++++++++++ .../liquids/ethanol_water_density/md_code.py | 167 ++++++++++++ 10 files changed, 1051 insertions(+) create mode 100644 ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py create mode 100644 ml_peg/analysis/liquids/ethanol_water_density/analysis.py create mode 100644 ml_peg/analysis/liquids/ethanol_water_density/io_tools.py create mode 100644 ml_peg/analysis/liquids/ethanol_water_density/metrics.yml create mode 100644 ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py create mode 100644 ml_peg/app/liquids/liquids.yml create mode 100644 ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py create mode 100644 ml_peg/calcs/liquids/ethanol_water_density/compositions.py create mode 100644 ml_peg/calcs/liquids/ethanol_water_density/fake_data.py create mode 100644 ml_peg/calcs/liquids/ethanol_water_density/md_code.py diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py new file mode 100644 index 000000000..6c8c1a0b1 --- /dev/null +++ b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py @@ -0,0 +1,238 @@ +# TODO: remove hardcoded things? +from pathlib import Path + +import numpy as np +import matplotlib.pyplot as plt +import pytest + +from ml_peg.analysis.liquids.ethanol_water_density.analysis import _rmse, _interp_1d, \ + _excess_curve, _peak_x_quadratic, x_to_phi_ethanol +from ml_peg.analysis.liquids.ethanol_water_density.io_tools import OUT_PATH, _debug_plot_enabled, _savefig, \ + _read_model_curve, read_ref_curve +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.analysis.utils.utils import load_metrics_config +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + + +MODELS = get_model_names(current_models) +MODEL_INDEX = {name: i for i, name in enumerate(MODELS)} # duplicate in calc + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") +DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( + METRICS_CONFIG_PATH +) + + +OUT_PATH.mkdir(parents=True, exist_ok=True) + + +@pytest.fixture(scope="session") +def ref_curve() -> tuple[np.ndarray, np.ndarray]: + x_ref, rho_ref = read_ref_curve() + x = np.asarray(x_ref, dtype=float) + rho = np.asarray(rho_ref, dtype=float) + + # Ensure monotonic x for interpolation + order = np.argsort(x) + return x[order], rho[order] + + +@pytest.fixture +def model_curves() -> dict[str, tuple[np.ndarray, np.ndarray]]: + curves: dict[str, tuple[np.ndarray, np.ndarray]] = {} + for model_name in MODELS: + xs, rhos = _read_model_curve(model_name) + x = np.asarray(xs, dtype=float) + rho = np.asarray(rhos, dtype=float) + + order = np.argsort(x) + curves[model_name] = (x[order], rho[order]) + return curves + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "density_parity.json", + title="Ethanol–water density (293.15 K)", + x_label="Reference density / g cm⁻³", + y_label="Predicted density / g cm⁻³", + #hoverdata={ + # "x_ethanol": [], # filled in fixture + #}, +) # TODO: read docs!!! doesn't seem to work yet. +def densities_parity(ref_curve, model_curves) -> dict[str, list]: + x_ref, rho_ref = ref_curve + + # Use the first model's x grid for hover labels (parity requires same-length lists) + # We’ll choose the densest model grid if they differ. + model_name_for_grid = max(model_curves, key=lambda m: len(model_curves[m][0])) + x_grid = model_curves[model_name_for_grid][0] + + results: dict[str, list] = {"ref": []} | {m: [] for m in MODELS} + + rho_ref_on_grid = _interp_1d(x_ref, rho_ref, x_grid) + results["ref"] = list(rho_ref_on_grid) + + for m in MODELS: + x_m, rho_m = model_curves[m] + # Interpolate model to x_grid if needed + if len(x_m) != len(x_grid) or np.any(np.abs(x_m - x_grid) > 1e-12): + # This assumes model spans the grid range; otherwise raise. + rho_m_on_grid = _interp_1d(x_m, rho_m, x_grid) + else: + rho_m_on_grid = rho_m + results[m] = list(rho_m_on_grid) + + ## Patch hoverdata list in-place (decorator reads the dict) + ## NOTE: if your decorator captures hoverdata at decoration time, + ## switch to hoverdata={"x_ethanol": x_labels()} fixture pattern like the docs. + #densities_parity.__wrapped__.__dict__.setdefault("hoverdata", {})["x_ethanol"] = list(x_grid) + + return results + +@pytest.fixture +def debug_curve_plots(ref_curve, model_curves) -> None: # TODO should I remove or use a different format? + if not _debug_plot_enabled(): + return + print("plotting curves") + + x_ref, rho_ref = ref_curve + + for m, (x_m, rho_m) in model_curves.items(): + rho_ref_m = _interp_1d(x_ref, rho_ref, x_m) + + fig, ax = plt.subplots() + ax.plot(x_ref, rho_ref, label="ref (dense)") + ax.plot(x_m, rho_m, marker="o", label=f"{m} (model)") + ax.plot(x_m, rho_ref_m, marker="x", label="ref on model grid") + ax.set_title(f"Density curve: {m}") + ax.set_xlabel("x_ethanol") + ax.set_ylabel("rho / g cm$^{-3}$") + ax.legend() + + print("saving a curve at:", OUT_PATH / "debug" / m / "density_curve.svg") + + # excess density + _savefig(fig, OUT_PATH / "debug" / m / "density_curve.svg") + rho_ref_m = _interp_1d(x_ref, rho_ref, x_m) + + fig, ax = plt.subplots() + ax.plot(x_ref, _excess_curve(x_ref, rho_ref), label="ref (dense)") + ax.plot(x_m, _excess_curve(x_m, rho_m), marker="o", label=f"{m} (model)") + ax.plot(x_m, _excess_curve(x_m, rho_ref_m), marker="x", label="ref on model grid") + ax.set_title(f"Density curve: {m}") + ax.set_xlabel("x_ethanol") + ax.set_ylabel("rho / g cm$^{-3}$") + ax.legend() + + print("saving a curve at:", OUT_PATH / "debug" / m / "excess_density_curve.svg") + _savefig(fig, OUT_PATH / "debug" / m / "excess_density_curve.svg") + + # volume fraction plot + phi_ref = x_to_phi_ethanol(x_ref, rho_ref) + phi_m = x_to_phi_ethanol(x_m, rho_m) + + fig, ax = plt.subplots() + ax.plot(phi_ref, rho_ref, label="ref (dense)") + ax.plot(phi_m, rho_m, marker="o", label=f"{m} (model)") + ax.plot(phi_m, rho_ref_m, marker="x", label="ref on model grid") + + ax.set_title(f"Density curve (volume fraction): {m}") + ax.set_xlabel(r"$\phi_\mathrm{ethanol}$") + ax.set_ylabel("rho / g cm$^{-3}$") + ax.legend() + + out_phi = OUT_PATH / "debug" / m / "density_curve_phi.svg" + print("saving a curve at:", out_phi) + _savefig(fig, out_phi) + + +@pytest.fixture +def rmse_density(ref_curve, model_curves) -> dict[str, float]: + x_ref, rho_ref = ref_curve + out: dict[str, float] = {} + for m, (x_m, rho_m) in model_curves.items(): + rho_ref_m = _interp_1d(x_ref, rho_ref, x_m) + out[m] = _rmse(rho_m, rho_ref_m) + return out + + +@pytest.fixture +def rmse_excess_density(ref_curve, model_curves) -> dict[str, float]: + """ + RMSE of excess density (detrended by each dataset's own pure endpoints). + """ + x_ref, rho_ref = ref_curve + out: dict[str, float] = {} + + for m, (x_m, rho_m) in model_curves.items(): + rho_ref_m = _interp_1d(x_ref, rho_ref, x_m) + + ex_ref = _excess_curve(x_m, rho_ref_m) + ex_m = _excess_curve(x_m, rho_m) + + out[m] = _rmse(ex_m, ex_ref) + + return out + + +@pytest.fixture +def peak_x_error(ref_curve, model_curves) -> dict[str, float]: + """ + Absolute error in the x-position of the maximum excess density. + + Ref peak is computed on the dense reference curve. + Model peak is computed on its (coarse) grid with a local quadratic refinement. + """ + x_ref, rho_ref = ref_curve + ex_ref_dense = _excess_curve(x_ref, rho_ref) + x_peak_ref = _peak_x_quadratic(x_ref, ex_ref_dense) + print("ref peak at:", x_peak_ref) + + out: dict[str, float] = {} + for m, (x_m, rho_m) in model_curves.items(): + ex_m = _excess_curve(x_m, rho_m) + x_peak_m = _peak_x_quadratic(x_m, ex_m) + out[m] = float(abs(x_peak_m - x_peak_ref)) + + return out + +# ----------------------------------------------------------------------------- +# Table +# ----------------------------------------------------------------------------- + +@pytest.fixture +@build_table( + thresholds=DEFAULT_THRESHOLDS, + filename=OUT_PATH / "density_metrics_table.json", + metric_tooltips={ + "Model": "Name of the model", + "RMSE density": "RMSE between model and reference density at model compositions (g cm⁻³).", + "RMSE excess density": ( + "RMSE after subtracting each curve’s linear baseline between pure endpoints (g cm⁻³)." + ), + "Peak x error": ( + "Absolute difference in mole-fraction location of maximum excess density." + ), + }, +) +def metrics( + rmse_density: dict[str, float], + rmse_excess_density: dict[str, float], + peak_x_error: dict[str, float], +) -> dict[str, dict]: + return { + "RMSE density": rmse_density, + "RMSE excess density": rmse_excess_density, + "Peak x error": peak_x_error, + } + + +def test_ethanol_water_density(metrics: dict[str, dict], densities_parity: dict[str, list], debug_curve_plots) -> None: + """ + Launch analysis (decorators handle writing JSON artifacts for the app). + """ + print(MODEL_INDEX) # TODO: these print statements may be useful for debugging, but should I remove? + print({key0:{MODEL_INDEX[name]: value for name, value in value0.items()} for key0, value0 in metrics.items()}) + return diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analysis.py b/ml_peg/analysis/liquids/ethanol_water_density/analysis.py new file mode 100644 index 000000000..c96c2aaf1 --- /dev/null +++ b/ml_peg/analysis/liquids/ethanol_water_density/analysis.py @@ -0,0 +1,103 @@ +import numpy as np + +M_WATER = 18.01528 # g/mol +M_ETOH = 46.06844 # g/mol + +# Pick densities consistent with your reference conditions (T,P!) +# If your ref curve is at ~20°C, these are around: +RHO_WATER_PURE = 0.9982 # g/cm^3 +RHO_ETH_PURE = 0.7893 # g/cm^3 + +def x_to_phi_ethanol(x, rho_mix, + *, M_eth=M_ETOH, M_water=M_WATER, + rho_eth=RHO_ETH_PURE, rho_water=RHO_WATER_PURE): + """ + Convert ethanol mole fraction x to ethanol volume fraction phi using + mixture density rho_mix and pure-component densities as volume proxies. + """ + x = np.asarray(x, dtype=float) + rho_mix = np.asarray(rho_mix, dtype=float) + + m_eth = x * M_eth + m_wat = (1.0 - x) * M_water + + V_mix = (m_eth + m_wat) / rho_mix # cm^3 per "1 mol mixture basis" + V_eth = m_eth / rho_eth # cm^3 (proxy) + phi = V_eth / V_mix + return phi + +def weight_to_mole_fraction(w): + """ + Convert ethanol weight fraction -> mole fraction. + + w = mass_ethanol / total_mass + """ + n_e = w / M_ETOH + n_w = (1 - w) / M_WATER + return n_e / (n_e + n_w) + + +def _rmse(a: np.ndarray, b: np.ndarray) -> float: + d = a - b + return float(np.sqrt(np.mean(d * d))) + + +def _interp_1d(x_src: np.ndarray, y_src: np.ndarray, x_tgt: np.ndarray) -> np.ndarray: + """ + Linear interpolation. Requires x_tgt within [min(x_src), max(x_src)]. + """ + if np.any(x_tgt < x_src.min() - 1e-12) or np.any(x_tgt > x_src.max() + 1e-12): + raise ValueError("Target x values fall outside reference interpolation range.") + return np.interp(x_tgt, x_src, y_src) + + +def _endpoints_at_0_1(x: np.ndarray, y: np.ndarray, tol: float = 1e-8) -> tuple[float, float]: + """ + Return y(x=0) and y(x=1). Requires that x includes (approximately) 0 and 1. + """ + i0 = np.where(np.isclose(x, 0.0, atol=tol))[0] + i1 = np.where(np.isclose(x, 1.0, atol=tol))[0] + if len(i0) != 1 or len(i1) != 1: + raise ValueError("Curve must include x=0 and x=1 to define linear baseline.") + return float(y[i0[0]]), float(y[i1[0]]) + + +def _linear_baseline(x: np.ndarray, y0: float, y1: float) -> np.ndarray: + return y0 + x * (y1 - y0) + + +def _excess_curve(x: np.ndarray, y: np.ndarray) -> np.ndarray: + """ + Excess relative to the dataset's own pure endpoints (x=0 and x=1). + """ + y0, y1 = _endpoints_at_0_1(x, y) + return y - _linear_baseline(x, y0, y1) + + +def _peak_x_quadratic(x: np.ndarray, y: np.ndarray) -> float: + """ + Estimate x position of minimum y. + - If min is interior and we have neighbors, fit quadratic through 3 points. + - Otherwise fall back to argmin x. + """ + if len(x) < 3: + return float(x[int(np.argmin(y))]) + + i = int(np.argmin(y)) + if i == 0 or i == len(x) - 1: + return float(x[i]) + + # Fit a parabola to (i-1, i, i+1) + xs = x[i - 1 : i + 2] + ys = y[i - 1 : i + 2] + + # y = ax^2 + bx + c + a, b, _c = np.polyfit(xs, ys, deg=2) + if abs(a) < 1e-16: + return float(x[i]) + + xv = -b / (2.0 * a) + + # Clamp to local bracket so we don't get silly extrapolation + xv = float(np.clip(xv, xs.min(), xs.max())) + return xv diff --git a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py new file mode 100644 index 000000000..2d4032d9c --- /dev/null +++ b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py @@ -0,0 +1,99 @@ +import csv +import os +from pathlib import Path + +import numpy as np +from matplotlib import pyplot as plt + +from ml_peg.analysis.liquids.ethanol_water_density.analysis import weight_to_mole_fraction +from ml_peg.app import APP_ROOT +from ml_peg.calcs import CALCS_ROOT + +CATEGORY = "liquids" +BENCHMARK = "ethanol_water_density" +CALC_PATH = CALCS_ROOT / CATEGORY / BENCHMARK / "outputs" +OUT_PATH = APP_ROOT / "data" / CATEGORY / BENCHMARK +DATA_PATH = CALCS_ROOT / CATEGORY / BENCHMARK / "data" + + +def _debug_plot_enabled() -> bool: + # Turn on plots by: DEBUG_PLOTS=1 pytest ... + return os.environ.get("DEBUG_PLOTS", "0") not in ("0", "", "false", "False") + + +def _savefig(fig, outpath: Path) -> None: + outpath.parent.mkdir(parents=True, exist_ok=True) + fig.tight_layout() + fig.savefig(outpath, dpi=200) + plt.close(fig) + + +def _read_model_curve(model_name: str) -> tuple[list[float], list[float]]: + """ + Read model density curve by computing averages from raw time series. + + Expects per-composition files: + x_ethanol_XX/density_timeseries.csv + with columns: step, rho_g_cm3 + """ + model_dir = CALC_PATH / model_name + xs: list[float] = [] + rhos: list[float] = [] + + for case_dir in sorted(model_dir.glob("x_ethanol_*")): + x_ethanol = float(case_dir.name.replace("x_ethanol_", "")) + + ts_path = case_dir / "density_timeseries.csv" + if not ts_path.exists(): + raise FileNotFoundError(f"Missing density time series: {ts_path}") + + rho_vals = [] + steps = [] + with ts_path.open(newline="") as f: + r = csv.DictReader(f) + for row in r: + steps.append(int(row["step"])) + rho_vals.append(float(row["rho_g_cm3"])) + + if not rho_vals: + raise ValueError(f"No density samples found in {ts_path}") + + rho_mean = float(np.mean(rho_vals)) + xs.append(x_ethanol) + rhos.append(rho_mean) + + if _debug_plot_enabled(): + fig, ax = plt.subplots() + ax.plot(steps, rho_vals) + ax.axhline(rho_mean, linestyle="--") + ax.set_title(f"{model_name} x={x_ethanol:.2f} density timeseries") + ax.set_xlabel("step") + ax.set_ylabel("rho / g cm$^{-3}$") + + _savefig(fig, OUT_PATH / "debug" / model_name / f"x_{x_ethanol:.2f}_timeseries.svg") + + return xs, rhos + + +def read_ref_curve() -> tuple[list[float], list[float]]: + """ + Load densities given on a uniform weight-fraction grid + and convert to mole fraction. + + Assumes: + - 101 evenly spaced points + - first line = 0 wt% ethanol + - last line = 100 wt% + """ + ref_file = DATA_PATH / "densities_293.15.txt" + rho_ref = np.loadtxt(ref_file) + + n = len(rho_ref) + + # weight fraction grid + w = np.linspace(0.0, 1.0, n) + + # convert to mole fraction + x = weight_to_mole_fraction(w) + + return list(x), list(rho_ref) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/metrics.yml b/ml_peg/analysis/liquids/ethanol_water_density/metrics.yml new file mode 100644 index 000000000..6da5b4f89 --- /dev/null +++ b/ml_peg/analysis/liquids/ethanol_water_density/metrics.yml @@ -0,0 +1,23 @@ +# TODO: the 'bad' defaults are quite arbitrary so pick decent ones; should I log rescale? +# TODO: add more metrics? e.g. radial distribution (or diffusion). +metrics: + RMSE density: + good: 0.0003 # approximate error of the reference on the coarse, sampled grid + bad: 0.02 + unit: g cm^-3 + tooltip: "RMSE between model and reference densities at the sampled compositions" + level_of_theory: experiment + + RMSE excess density: + good: 0.0003 # approximate error of the reference on the coarse, sampled grid + bad: 0.01 + unit: g cm^-3 + tooltip: "RMSE of excess (non-ideal) density after subtracting linear mixing baseline" + level_of_theory: experiment + + Peak x error: + good: 0.005 # approximate error of the reference on the coarse, sampled grid + bad: 0.20 + unit: mole fraction + tooltip: "Absolute error in the mole-fraction location of maximum excess density" + level_of_theory: experiment diff --git a/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py b/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py new file mode 100644 index 000000000..2ded1d118 --- /dev/null +++ b/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py @@ -0,0 +1,82 @@ +#TODO: This does not work. Fix this + +"""Run ethanol–water density (decomposition curves) app.""" +from dash import Dash +from dash.html import Div + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.app.utils.build_callbacks import plot_from_table_column +from ml_peg.app.utils.load import read_plot + +# ----------------------------------------------------------------------------- +# Configuration +# ----------------------------------------------------------------------------- + +CATEGORY = "liquids" +BENCHMARK_NAME = "ethanol_water_density" + +DOCS_URL = ( + "https://ddmms.github.io/ml-peg/user_guide/benchmarks/" # TODO: update to the right anchor +) + +DATA_PATH = APP_ROOT / "data" / CATEGORY / BENCHMARK_NAME + + +class EthanolWaterDecompositionCurvesApp(BaseApp): + """Ethanol–water density benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register callbacks to app.""" + parity = read_plot(DATA_PATH / "density_parity.json", id=f"{BENCHMARK_NAME}-figure") + + # When the user clicks a metric column in the table, show the parity plot. + # (This mirrors the GMTKN55 pattern: different columns can map to different plots; + # here they all map to the same parity plot artifact.) + plot_from_table_column( + table_id=self.table_id, + plot_id=f"{BENCHMARK_NAME}-figure-placeholder", + column_to_plot={ + "RMSE density": parity, + "RMSE excess density": parity, + "Peak x error": parity, + }, + ) + + +def get_app() -> EthanolWaterDecompositionCurvesApp: + """ + Get ethanol–water benchmark app layout and callback registration. + + Returns + ------- + EthanolWaterDecompositionCurvesApp + Benchmark layout and callback registration. + """ + return EthanolWaterDecompositionCurvesApp( + name=BENCHMARK_NAME, + description=( + "Ethanol–water mixture density at 293.15 K. Metrics include density RMSE, " + "excess-density RMSE (baseline-subtracted), and error in the mole-fraction " + "location of the maximum excess density." + ), + docs_url=DOCS_URL, + table_path=DATA_PATH / "density_metrics_table.json", + extra_components=[ + Div(id=f"{BENCHMARK_NAME}-figure-placeholder"), + ], + ) + + +if __name__ == "__main__": + # Create Dash app + # assets_folder should be the parent of the "assets///..." tree + full_app = Dash(__name__, assets_folder=DATA_PATH.parent) + + # Construct layout and register callbacks + app = get_app() + full_app.layout = app.layout + app.register_callbacks() + + # Run app + full_app.run(port=8051, debug=True) diff --git a/ml_peg/app/liquids/liquids.yml b/ml_peg/app/liquids/liquids.yml new file mode 100644 index 000000000..19fcd877d --- /dev/null +++ b/ml_peg/app/liquids/liquids.yml @@ -0,0 +1,2 @@ +title: Liquids # may remove this and move contents to other category +description: Properties of liquids, including densities diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py new file mode 100644 index 000000000..6d105c7ad --- /dev/null +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -0,0 +1,112 @@ +import csv +import os +from pathlib import Path +from typing import Any + +import numpy as np +import pytest + +from ml_peg.calcs.liquids.ethanol_water_density.compositions import BENCH_ROOT, DATA_PATH, load_compositions +from ml_peg.calcs.liquids.ethanol_water_density.fake_data import make_fake_curve, make_fake_density_timeseries +from ml_peg.calcs.liquids.ethanol_water_density.md_code import run_one_case +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +# Local paths +OUT_PATH = BENCH_ROOT / "outputs" + +MODELS = load_models(current_models) +MODEL_INDEX = {name: i for i, name in enumerate(MODELS)} +FAKE_DATA = os.getenv("FAKE_DENSITY_DATA", "") == "1" + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_water_ethanol_density_curve(mlip: tuple[str, Any]) -> None: + if not FAKE_DATA: + water_ethanol_density_curve(mlip) + else: + water_ethanol_density_dummy_data(mlip) + +def water_ethanol_density_curve(mlip: tuple[str, Any]) -> None: + """ + Run water–ethanol density curve benchmark for a single MLIP. + + Writes: + - per-composition density time series (raw data) + - a summary CSV derived from those time series + """ + model_name, model = mlip # TODO: dispersion ??? + cases = load_compositions() + + # Where this model writes outputs + model_out = OUT_PATH / model_name + model_out.mkdir(parents=True, exist_ok=True) + + # Get calculator + calc = model.get_calculator() + + for case in cases: + struct_path = DATA_PATH / case.filename + if not struct_path.exists(): + raise FileNotFoundError( + f"Missing structure for x={case.x_ethanol}: {struct_path}" + ) + + case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" + case_dir.mkdir(parents=True, exist_ok=True) + + # --- run simulation --- + rho_series = run_one_case( + struct_path, + calc, + workdir=case_dir, + ) + + # --- write raw density time series --- + ts_path = case_dir / "density_timeseries.csv" + with ts_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["step", "rho_g_cm3"]) + for i, rho in enumerate(rho_series): + w.writerow([i, f"{rho:.8f}"]) + + +def water_ethanol_density_dummy_data(mlip: tuple[str, Any]) -> None: + model_name, model = mlip + cases = load_compositions() + + model_out = OUT_PATH / model_name + model_out.mkdir(parents=True, exist_ok=True) + + # one curve per model + model_kind = MODEL_INDEX[model_name] % 4 + xs_curve, ys_curve = make_fake_curve(model_kind, seed=MODEL_INDEX[model_name]) + xs_curve = np.asarray(xs_curve, dtype=float) + ys_curve = np.asarray(ys_curve, dtype=float) + + for case in cases: + case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" + case_dir.mkdir(parents=True, exist_ok=True) + + rho_eq = float(np.interp(case.x_ethanol, xs_curve, ys_curve)) + n_steps = 200 # fixed for dummy data + + seed = (hash(model_name) ^ hash(round(case.x_ethanol, 4))) & 0xFFFFFFFF + rho_series = make_fake_density_timeseries( + rho_eq, n_steps, seed=seed, start_offset=0.01, tau=0.10, noise_sigma=0.0005 + ) + + ts_path = case_dir / "density_timeseries.csv" + with ts_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["step", "rho_g_cm3"]) + for i, rho in enumerate(rho_series): + w.writerow([i, f"{rho:.8f}"]) + + +if __name__ == "__main__": # TODO: delete this + # run a very small simulation to see if it does something reasonable + from mace.calculators import mace_mp + calc = mace_mp("data_old/mace-omat-0-small.model") + rho = run_one_case("data/mix_xe_0.10.extxyz", calc, nvt_stabilise_steps=250, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) + print(rho) \ No newline at end of file diff --git a/ml_peg/calcs/liquids/ethanol_water_density/compositions.py b/ml_peg/calcs/liquids/ethanol_water_density/compositions.py new file mode 100644 index 000000000..0c47100bc --- /dev/null +++ b/ml_peg/calcs/liquids/ethanol_water_density/compositions.py @@ -0,0 +1,34 @@ +import csv +from dataclasses import dataclass +from pathlib import Path + +BENCH_ROOT = Path(__file__).resolve().parent +DATA_PATH = BENCH_ROOT / "data" + + +@dataclass(frozen=True) +class CompositionCase: + x_ethanol: float + filename: str + + +def load_compositions() -> list[CompositionCase]: + """ + Load composition grid. + + Expected CSV columns: x_ethanol, filename + """ + comps_file = DATA_PATH / "compositions.csv" + cases: list[CompositionCase] = [] + with comps_file.open(newline="") as f: + reader = csv.DictReader(f) + for row in reader: + cases.append( + CompositionCase( + x_ethanol=float(row["x_ethanol"]), + filename=row["filename"], + ) + ) + if not cases: + raise RuntimeError("No compositions found in compositions.csv") + return cases diff --git a/ml_peg/calcs/liquids/ethanol_water_density/fake_data.py b/ml_peg/calcs/liquids/ethanol_water_density/fake_data.py new file mode 100644 index 000000000..8fddcc073 --- /dev/null +++ b/ml_peg/calcs/liquids/ethanol_water_density/fake_data.py @@ -0,0 +1,191 @@ +# for debugging, to verify that metrics actually do something reasonable + +from dataclasses import dataclass +import numpy as np + +from ml_peg.analysis.liquids.ethanol_water_density.io_tools import read_ref_curve + +@dataclass(frozen=True) +class FakeCurveParams: + # Master knob: 0 -> perfect match, 1 -> very poor + severity: float = 0.0 + + # Individual error components (interpreted as "max at severity=1") + bias: float = 0.0 # additive offset in y-units + scale: float = 0.0 # multiplicative: y *= (1 + scale*...) + tilt: float = 0.0 # linear-in-x additive distortion + warp: float = 0.0 # smooth nonlinear additive distortion + + noise_sigma: float = 0.0 # iid gaussian noise in y-units + corr_len: float = 0.0 # if >0, adds correlated noise along x + + bump_amp: float = 0.0 # amplitude of local bump(s) + bump_center: float = 0.5 # x location of bump + bump_width: float = 0.08 # bump width (in x units) + + +def _smooth_random_field(xs: np.ndarray, corr_len: float, rng: np.random.Generator) -> np.ndarray: + """ + Create a zero-mean, ~unit-std smooth random field along xs using + a Gaussian kernel in x-distance. O(N^2) but tiny N here (6 points). + """ + if corr_len <= 0: + return np.zeros_like(xs) + + dx = xs[:, None] - xs[None, :] + K = np.exp(-0.5 * (dx / corr_len) ** 2) + # sample correlated normal: K^(1/2) z via cholesky (add jitter for stability) + L = np.linalg.cholesky(K + 1e-12 * np.eye(len(xs))) + z = rng.standard_normal(len(xs)) + field = L @ z + field = field - field.mean() + field = field / (field.std() + 1e-12) + return field + + +def make_fake_curve_from_ref( + xs_ref: list[float], + ys_ref: list[float], + *, + params: FakeCurveParams, + seed: int | None = 0, + clip: tuple[float | None, float | None] = (None, None), +) -> tuple[list[float], list[float]]: + """ + Return (xs, ys_fake) using the same xs as the reference. + Designed for density-like curves but works generically. + + `severity` scales *all* enabled components. For example, if bias=10 and + severity=0.2, you get ~2 units of bias (with a tiny randomization). + """ + sev = float(np.clip(params.severity, 0.0, 1.0)) + rng = np.random.default_rng(seed) + + xs = np.asarray(xs_ref, dtype=float) + y = np.asarray(ys_ref, dtype=float).copy() + + # Normalize x into [-1, 1] for stable “tilt/warp” magnitudes + x01 = (xs - xs.min()) / (xs.max() - xs.min() + 1e-12) + xpm = 2.0 * x01 - 1.0 + + # Small randomization so multiple models with same severity aren’t identical + # (but still deterministic for a given seed). + jitter = lambda: (0.85 + 0.30 * rng.random()) + + # 1) multiplicative scale error + if params.scale != 0.0 and sev > 0: + y *= (1.0 + (params.scale * sev * jitter())) + + # 2) additive bias + if params.bias != 0.0 and sev > 0: + y += (params.bias * sev * jitter()) + + # 3) linear tilt (additive) + if params.tilt != 0.0 and sev > 0: + y += (params.tilt * sev * jitter()) * xpm + + # 4) smooth nonlinear warp (additive): use a low-order smooth basis + if params.warp != 0.0 and sev > 0: + # cubic-ish shape distortion with zero mean + w = (xpm**3 - xpm * np.mean(xpm**2)) + w = w - w.mean() + w = w / (np.std(w) + 1e-12) + y += (params.warp * sev * jitter()) * w + + # 5) local bump to simulate specific composition failure + if params.bump_amp != 0.0 and sev > 0: + bump = np.exp(-0.5 * ((xs - params.bump_center) / (params.bump_width + 1e-12)) ** 2) + bump = bump / (bump.max() + 1e-12) + y += (params.bump_amp * sev * jitter()) * bump + + # 6) noise: iid + optional correlated component + if params.noise_sigma != 0.0 and sev > 0: + y += rng.normal(0.0, params.noise_sigma * sev, size=len(xs)) + + if params.corr_len > 0.0 and params.noise_sigma != 0.0 and sev > 0: + field = _smooth_random_field(xs, params.corr_len, rng) + y += field * (params.noise_sigma * sev * 0.8) + + lo, hi = clip + if lo is not None: + y = np.maximum(y, lo) + if hi is not None: + y = np.minimum(y, hi) + + return xs, y + + +# Convenience presets: "good", "medium", "bad" +def make_fake_curve( + kind: str|int, + seed: int | None = 0, +) -> tuple[list[float], list[float]]: + xs_ref, ys_ref = read_ref_curve() + + kind = kind.lower().strip() if isinstance(kind, str) else kind + if kind == "perfect" or kind == 0: + params = FakeCurveParams(severity=0.0) + elif kind == "good" or kind == 1: + params = FakeCurveParams( + severity=0.15, + bias=0.0, + scale=0.01, + tilt=0.003, + warp=0.002, + noise_sigma=0.001, + corr_len=0.12, + bump_amp=0.0, + ) + elif kind in {"medium", "ok"} or kind == 2: + params = FakeCurveParams( + severity=0.45, + bias=0.0, + scale=0.03, + tilt=0.01, + warp=0.01, + noise_sigma=0.004, + corr_len=0.15, + bump_amp=0.01, + bump_center=0.6, + bump_width=0.10, + ) + elif kind == "bad" or kind == 3: + params = FakeCurveParams( + severity=0.85, + bias=0.02, + scale=0.06, + tilt=0.03, + warp=0.04, + noise_sigma=0.01, + corr_len=0.18, + bump_amp=0.05, + bump_center=0.4, + bump_width=0.08, + ) + else: + raise ValueError(f"Unknown kind={kind!r} (use perfect/good/medium/bad)") + + return make_fake_curve_from_ref(xs_ref, ys_ref, params=params, seed=seed) + + +def make_fake_density_timeseries( + rho_eq: float, + n_steps: int, + *, + seed: int, + start_offset: float = 0.02, # initial deviation from eq + tau: float = 0.15, # relaxation rate (bigger -> faster) + noise_sigma: float = 0.001, # per-step noise +) -> list[float]: + rng = np.random.default_rng(seed) + rho0 = rho_eq + start_offset * (2 * rng.random() - 1) + + series = [] + rho = rho0 + for t in range(n_steps): + # exponential-ish relaxation to rho_eq + rho += tau * (rho_eq - rho) + # add noise + rho += rng.normal(0.0, noise_sigma) + series.append(float(rho)) + return series diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py new file mode 100644 index 000000000..a7adc65c0 --- /dev/null +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -0,0 +1,167 @@ +import time +from contextlib import contextmanager +from pathlib import Path +from typing import Any + +import numpy as np +from ase.io import Trajectory, read, write +from ase.md import MDLogger, Langevin +from ase.md.langevinbaoab import LangevinBAOAB +from ase.md.nptberendsen import NPTBerendsen +from ase.md.velocitydistribution import MaxwellBoltzmannDistribution, Stationary, ZeroRotation +from ase.optimize import FIRE +from ase.units import fs, bar + + +def total_mass_kg(atoms): + amu_to_kg = 1.66053906660e-27 + return atoms.get_masses().sum() * amu_to_kg + + +def density_g_cm3(atoms): + V_A3 = atoms.get_volume() + V_m3 = V_A3 * 1e-30 + m_kg = total_mass_kg(atoms) + rho_kg_m3 = m_kg / V_m3 + return rho_kg_m3 / 1000.0 + + +def attach_basic_logging(dyn, atoms, md_logfile, log_every, t0): + logger = MDLogger( + dyn, + atoms, + md_logfile, + header=True, + stress=False, + peratom=False, + mode="a", + ) + dyn.attach(logger, interval=log_every) + + def progress(): + step = dyn.get_number_of_steps() + rho = density_g_cm3(atoms) + V = atoms.get_volume() + T = atoms.get_temperature() + elapsed = time.time() - t0 + + print( + f"[step {step:>8}] " + f"T={T:7.2f} K | " + f"V={V:10.2f} A^3 | " + f"rho={rho:7.4f} g/cm^3 | " + f"elapsed={elapsed:6.1f}s" + ) + + dyn.attach(progress, interval=log_every) + + +@contextmanager +def traj_logging(dyn, atoms, workdir, traj_every: int, name="md.traj"): + traj = None + if traj_every and traj_every > 0: + traj = Trajectory(str(workdir/name), "a", atoms) + dyn.attach(traj.write, interval=traj_every) + try: + yield traj + finally: + if traj is not None: + traj.close() + +def run_one_case( + struct_path: Path, + calc: Any, + *, + T_K: float = 298.15, + P_bar: float = 1.0, + dt_fs: float = 0.5, + nvt_stabilise_steps: int = 250, + npt_settle_steps = 7_500, + nvt_thermalise_steps: int = 1_000, + npt_equil_steps: int = 10_000, + npt_prod_steps: int = 25_000, + sample_every: int = 20, + log_every: int = 200, + log_trajectory_every: int=400, + dummy_data=False, + workdir: Path, +) -> float: + """ + Run NPT and return (mean_density, std_density). + + TODO: use lammps? Though I would guess GPU is the bottleneck so it wouldn't matter? + """ + if dummy_data: + return np.random.normal(loc=0.9,scale=0.05,size=npt_prod_steps//sample_every) + atoms = read(struct_path) + atoms.set_pbc(True) + atoms.wrap() + atoms.calc = calc + + # fast pre-relax + opt = FIRE(atoms, logfile=str(workdir / "opt.log")) + opt.run(fmax=0.15) + + # velocities + MaxwellBoltzmannDistribution(atoms, temperature_K=T_K) + Stationary(atoms) + ZeroRotation(atoms) + + dt = dt_fs * fs + t0 = time.time() + + # the used pre-relax is not good enough, do some Langevin NVT steps before starting NPT + dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.01) + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): + dyn.run(nvt_stabilise_steps) + + # quick Berendsen settle close to target density + ps = 1000 * fs + dyn = NPTBerendsen( + atoms, + timestep=dt, + temperature_K=T_K, + pressure_au=P_bar * bar, + taut=0.07 * ps, + taup=0.4 * ps, + compressibility=4.5e-5, + ) + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): + dyn.run(npt_settle_steps) + + # thermalise + MaxwellBoltzmannDistribution(atoms, temperature_K=T_K) + Stationary(atoms) + ZeroRotation(atoms) + dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.03) + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): + dyn.run(nvt_thermalise_steps) + + # real NPT + dyn = LangevinBAOAB( # MTK + atoms, + timestep=dt, + temperature_K=T_K, + externalstress=P_bar * bar, + T_tau=0.1 * ps, + P_tau=1 * ps, + hydrostatic=True, + rng=0, + ) + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): + dyn.run(npt_equil_steps) + + rhos = [] + n_samples = npt_prod_steps // sample_every + for _ in range(n_samples): + dyn.run(sample_every) + rhos.append(density_g_cm3(atoms)) + + # save final structure for debugging/repro + write(workdir / "final.extxyz", atoms) + + return np.array(rhos) From 2b5a1230622c11cbe57a5d6224bce78d7045a375 Mon Sep 17 00:00:00 2001 From: Arn Date: Sat, 7 Feb 2026 22:26:15 +0000 Subject: [PATCH 02/12] add reference --- ml_peg/analysis/liquids/ethanol_water_density/io_tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py index 2d4032d9c..88e662d1d 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py @@ -79,6 +79,8 @@ def read_ref_curve() -> tuple[list[float], list[float]]: """ Load densities given on a uniform weight-fraction grid and convert to mole fraction. + Densities from: + M. Southard and D. Green, Perry’s Chemical Engineers’ Handbook, 9th Edition. McGraw-Hill Education, 2018. Assumes: - 101 evenly spaced points From 7aa82c4ae04f624f4bd2e488e4a30816a3dda32f Mon Sep 17 00:00:00 2001 From: Arn Date: Tue, 10 Feb 2026 11:32:04 +0000 Subject: [PATCH 03/12] split calcs --- .../calc_ethanol_water_density.py | 118 +++++++++--------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 6d105c7ad..7c3d3e7f6 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -6,107 +6,101 @@ import numpy as np import pytest -from ml_peg.calcs.liquids.ethanol_water_density.compositions import BENCH_ROOT, DATA_PATH, load_compositions -from ml_peg.calcs.liquids.ethanol_water_density.fake_data import make_fake_curve, make_fake_density_timeseries +from ml_peg.calcs.liquids.ethanol_water_density.compositions import ( + BENCH_ROOT, + DATA_PATH, + load_compositions, +) +from ml_peg.calcs.liquids.ethanol_water_density.fake_data import ( + make_fake_curve, + make_fake_density_timeseries, +) from ml_peg.calcs.liquids.ethanol_water_density.md_code import run_one_case from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models -# Local paths OUT_PATH = BENCH_ROOT / "outputs" MODELS = load_models(current_models) MODEL_INDEX = {name: i for i, name in enumerate(MODELS)} FAKE_DATA = os.getenv("FAKE_DENSITY_DATA", "") == "1" +# IMPORTANT: create the list once for parametrization +COMPOSITIONS = load_compositions() -@pytest.mark.parametrize("mlip", MODELS.items()) -def test_water_ethanol_density_curve(mlip: tuple[str, Any]) -> None: + +def _case_id(composition) -> str: + # nicer test ids in `pytest -vv` + return f"x={composition.x_ethanol:.2f}" + + +@pytest.mark.parametrize("mlip", MODELS.items(), ids=[n for n in MODELS.keys()]) +@pytest.mark.parametrize("composition", COMPOSITIONS, ids=[_case_id(c) for c in COMPOSITIONS]) +def test_water_ethanol_density_curve(mlip: tuple[str, Any], composition) -> None: if not FAKE_DATA: - water_ethanol_density_curve(mlip) + water_ethanol_density_curve_one_case(mlip, composition) else: - water_ethanol_density_dummy_data(mlip) + water_ethanol_density_dummy_data_one_case(mlip, composition) -def water_ethanol_density_curve(mlip: tuple[str, Any]) -> None: - """ - Run water–ethanol density curve benchmark for a single MLIP. - Writes: - - per-composition density time series (raw data) - - a summary CSV derived from those time series - """ +def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: model_name, model = mlip # TODO: dispersion ??? - cases = load_compositions() - # Where this model writes outputs model_out = OUT_PATH / model_name model_out.mkdir(parents=True, exist_ok=True) - # Get calculator calc = model.get_calculator() - for case in cases: - struct_path = DATA_PATH / case.filename - if not struct_path.exists(): - raise FileNotFoundError( - f"Missing structure for x={case.x_ethanol}: {struct_path}" - ) - - case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" - case_dir.mkdir(parents=True, exist_ok=True) - - # --- run simulation --- - rho_series = run_one_case( - struct_path, - calc, - workdir=case_dir, - ) - - # --- write raw density time series --- - ts_path = case_dir / "density_timeseries.csv" - with ts_path.open("w", newline="") as f: - w = csv.writer(f) - w.writerow(["step", "rho_g_cm3"]) - for i, rho in enumerate(rho_series): - w.writerow([i, f"{rho:.8f}"]) - - -def water_ethanol_density_dummy_data(mlip: tuple[str, Any]) -> None: + struct_path = DATA_PATH / case.filename + if not struct_path.exists(): + raise FileNotFoundError(f"Missing structure for x={case.x_ethanol}: {struct_path}") + + case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" + case_dir.mkdir(parents=True, exist_ok=True) + + rho_series = run_one_case(struct_path, calc, workdir=case_dir) + + ts_path = case_dir / "density_timeseries.csv" + with ts_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["step", "rho_g_cm3"]) + for i, rho in enumerate(rho_series): + w.writerow([i, f"{rho:.8f}"]) + + +def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> None: model_name, model = mlip - cases = load_compositions() model_out = OUT_PATH / model_name model_out.mkdir(parents=True, exist_ok=True) - # one curve per model model_kind = MODEL_INDEX[model_name] % 4 xs_curve, ys_curve = make_fake_curve(model_kind, seed=MODEL_INDEX[model_name]) xs_curve = np.asarray(xs_curve, dtype=float) ys_curve = np.asarray(ys_curve, dtype=float) - for case in cases: - case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" - case_dir.mkdir(parents=True, exist_ok=True) + case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" + case_dir.mkdir(parents=True, exist_ok=True) - rho_eq = float(np.interp(case.x_ethanol, xs_curve, ys_curve)) - n_steps = 200 # fixed for dummy data + rho_eq = float(np.interp(case.x_ethanol, xs_curve, ys_curve)) + n_steps = 200 # fixed for dummy data - seed = (hash(model_name) ^ hash(round(case.x_ethanol, 4))) & 0xFFFFFFFF - rho_series = make_fake_density_timeseries( - rho_eq, n_steps, seed=seed, start_offset=0.01, tau=0.10, noise_sigma=0.0005 - ) + seed = (hash(model_name) ^ hash(round(case.x_ethanol, 4))) & 0xFFFFFFFF + rho_series = make_fake_density_timeseries( + rho_eq, n_steps, seed=seed, start_offset=0.01, tau=0.10, noise_sigma=0.0005 + ) - ts_path = case_dir / "density_timeseries.csv" - with ts_path.open("w", newline="") as f: - w = csv.writer(f) - w.writerow(["step", "rho_g_cm3"]) - for i, rho in enumerate(rho_series): - w.writerow([i, f"{rho:.8f}"]) + ts_path = case_dir / "density_timeseries.csv" + with ts_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["step", "rho_g_cm3"]) + for i, rho in enumerate(rho_series): + w.writerow([i, f"{rho:.8f}"]) if __name__ == "__main__": # TODO: delete this # run a very small simulation to see if it does something reasonable from mace.calculators import mace_mp calc = mace_mp("data_old/mace-omat-0-small.model") - rho = run_one_case("data/mix_xe_0.10.extxyz", calc, nvt_stabilise_steps=250, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) + rho = run_one_case("data/mix_xe_0.00.extxyz", calc, nvt_stabilise_steps=3000, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) print(rho) \ No newline at end of file From 7140c3c267575892a5bdd7d5e305df5c0af5da84 Mon Sep 17 00:00:00 2001 From: Arn Date: Tue, 10 Feb 2026 12:07:45 +0000 Subject: [PATCH 04/12] Better checkpointing --- .../calc_ethanol_water_density.py | 15 +-- .../liquids/ethanol_water_density/io_tools.py | 120 ++++++++++++++++++ .../liquids/ethanol_water_density/md_code.py | 30 +++-- 3 files changed, 144 insertions(+), 21 deletions(-) create mode 100644 ml_peg/calcs/liquids/ethanol_water_density/io_tools.py diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 7c3d3e7f6..1b0574554 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -15,6 +15,7 @@ make_fake_curve, make_fake_density_timeseries, ) +from ml_peg.calcs.liquids.ethanol_water_density.io_tools import write_density_timeseries_checkpointed from ml_peg.calcs.liquids.ethanol_water_density.md_code import run_one_case from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models @@ -61,11 +62,7 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: rho_series = run_one_case(struct_path, calc, workdir=case_dir) ts_path = case_dir / "density_timeseries.csv" - with ts_path.open("w", newline="") as f: - w = csv.writer(f) - w.writerow(["step", "rho_g_cm3"]) - for i, rho in enumerate(rho_series): - w.writerow([i, f"{rho:.8f}"]) + write_density_timeseries_checkpointed(ts_path, rho_series) def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> None: @@ -91,16 +88,12 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No ) ts_path = case_dir / "density_timeseries.csv" - with ts_path.open("w", newline="") as f: - w = csv.writer(f) - w.writerow(["step", "rho_g_cm3"]) - for i, rho in enumerate(rho_series): - w.writerow([i, f"{rho:.8f}"]) + write_density_timeseries_checkpointed(ts_path, rho_series) if __name__ == "__main__": # TODO: delete this # run a very small simulation to see if it does something reasonable from mace.calculators import mace_mp calc = mace_mp("data_old/mace-omat-0-small.model") - rho = run_one_case("data/mix_xe_0.00.extxyz", calc, nvt_stabilise_steps=3000, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) + rho = run_one_case("data/mix_xe_0.00.extxyz", calc, nvt_stabilise_steps=200, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) print(rho) \ No newline at end of file diff --git a/ml_peg/calcs/liquids/ethanol_water_density/io_tools.py b/ml_peg/calcs/liquids/ethanol_water_density/io_tools.py new file mode 100644 index 000000000..e3a03a4c4 --- /dev/null +++ b/ml_peg/calcs/liquids/ethanol_water_density/io_tools.py @@ -0,0 +1,120 @@ +import csv +from pathlib import Path +from typing import Iterable + + +def write_density_timeseries_checkpointed( + ts_path: Path, + rho_series: Iterable[float], + *, + min_match_fraction: float = 0.8, +) -> None: + """ + Write density_timeseries.csv with checkpoint validation. + + Behavior + -------- + If file exists: + - read existing values + - verify >= min_match_fraction already match rho_series + - overwrite anyway + - raise AssertionError if insufficient match + + If file does not exist: + - just write + + Helps detect broken resume logic while still allowing overwrite. + """ + rho_series = list(rho_series) + + # ------------------------- + # 1) Validate existing file + # ------------------------- + if ts_path.exists(): + old_vals: list[float] = [] + + try: + with ts_path.open() as f: + r = csv.reader(f) + next(r, None) # header + for row in r: + if len(row) >= 2: + old_vals.append(float(row[1])) + except Exception: + old_vals = [] + + if old_vals: + n = min(len(old_vals), len(rho_series)) + + matches = sum( + abs(old_vals[i] - rho_series[i]) < 1e-6 + for i in range(n) + ) + + frac = matches / n if n else 0.0 + + if frac < min_match_fraction: + raise AssertionError( + f"{ts_path}: only {frac:.1%} of checkpoint values match " + f"(expected ≥ {min_match_fraction:.0%}). " + "run_one_case resume likely broken." + ) + + # ------------------------- + # 2) Always rewrite file + # ------------------------- + with ts_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["step", "rho_g_cm3"]) + for i, rho in enumerate(rho_series): + w.writerow([i, f"{rho:.8f}"]) + + + + + +class DensityTimeseriesLogger: + """ + Streaming CSV logger for density time series. + + - deletes existing file on start (optional) + - writes header once + - append rows as simulation runs + - flushes every write (crash-safe) + - usable as context manager + """ + + def __init__(self, path: Path, *, overwrite: bool = True): + self.path = Path(path) + self.overwrite = overwrite + self._f = None + self._writer = None + self._step = 0 + + # --------------------- + # context manager API + # --------------------- + def __enter__(self): + if self.overwrite and self.path.exists(): + self.path.unlink() + + self._f = self.path.open("w", newline="") + self._writer = csv.writer(self._f) + + self._writer.writerow(["step", "rho_g_cm3"]) + self._f.flush() + + return self + + def __exit__(self, exc_type, exc, tb): + if self._f: + self._f.close() + + # --------------------- + # logging + # --------------------- + def write(self, rho: float): + """Write one density value.""" + self._writer.writerow([self._step, f"{rho:.8f}"]) + self._f.flush() # critical for crash safety + self._step += 1 diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index a7adc65c0..c8c2ba280 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -1,7 +1,7 @@ import time from contextlib import contextmanager from pathlib import Path -from typing import Any +from typing import Any, Iterable import numpy as np from ase.io import Trajectory, read, write @@ -12,6 +12,8 @@ from ase.optimize import FIRE from ase.units import fs, bar +from ml_peg.calcs.liquids.ethanol_water_density.io_tools import DensityTimeseriesLogger + def total_mass_kg(atoms): amu_to_kg = 1.66053906660e-27 @@ -75,7 +77,7 @@ def run_one_case( T_K: float = 298.15, P_bar: float = 1.0, dt_fs: float = 0.5, - nvt_stabilise_steps: int = 250, + nvt_stabilise_steps: int = 4_000, npt_settle_steps = 7_500, nvt_thermalise_steps: int = 1_000, npt_equil_steps: int = 10_000, @@ -85,14 +87,19 @@ def run_one_case( log_trajectory_every: int=400, dummy_data=False, workdir: Path, -) -> float: +) -> Iterable[float]: """ Run NPT and return (mean_density, std_density). TODO: use lammps? Though I would guess GPU is the bottleneck so it wouldn't matter? """ + ts_path = workdir / "density_timeseries.csv" if dummy_data: - return np.random.normal(loc=0.9,scale=0.05,size=npt_prod_steps//sample_every) + rho_series = np.random.normal(loc=0.9,scale=0.05,size=npt_prod_steps//sample_every) + with DensityTimeseriesLogger(ts_path) as density_log: + for rho in rho_series: + density_log.write(rho) + return rho_series atoms = read(struct_path) atoms.set_pbc(True) atoms.wrap() @@ -111,7 +118,7 @@ def run_one_case( t0 = time.time() # the used pre-relax is not good enough, do some Langevin NVT steps before starting NPT - dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.01) + dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.02) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): dyn.run(nvt_stabilise_steps) @@ -155,11 +162,14 @@ def run_one_case( with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): dyn.run(npt_equil_steps) - rhos = [] - n_samples = npt_prod_steps // sample_every - for _ in range(n_samples): - dyn.run(sample_every) - rhos.append(density_g_cm3(atoms)) + rhos = [] + n_samples = npt_prod_steps // sample_every + with DensityTimeseriesLogger(ts_path) as density_log: + for _ in range(n_samples): + dyn.run(sample_every) + rho = density_g_cm3(atoms) + rhos.append(rho) + density_log.write(rho) # save final structure for debugging/repro write(workdir / "final.extxyz", atoms) From a2db61a4abe787ca8ec9fb1de77d8d99aeadec49 Mon Sep 17 00:00:00 2001 From: Arn Date: Tue, 10 Feb 2026 16:00:22 +0000 Subject: [PATCH 05/12] linting and ruff --- .../{analysis.py => _analysis.py} | 64 ++++++++------- .../{io_tools.py => _io_tools.py} | 25 ++++-- .../analyse_ethanol_water_density.py | 82 +++++++++++++------ .../app_ethanol_water_density.py | 15 ++-- .../{compositions.py => _compositions.py} | 6 ++ .../{fake_data.py => _fake_data.py} | 66 ++++++++------- .../{io_tools.py => _io_tools.py} | 16 ++-- .../calc_ethanol_water_density.py | 37 +++++++-- .../liquids/ethanol_water_density/md_code.py | 73 ++++++++++------- 9 files changed, 242 insertions(+), 142 deletions(-) rename ml_peg/analysis/liquids/ethanol_water_density/{analysis.py => _analysis.py} (65%) rename ml_peg/analysis/liquids/ethanol_water_density/{io_tools.py => _io_tools.py} (85%) rename ml_peg/calcs/liquids/ethanol_water_density/{compositions.py => _compositions.py} (89%) rename ml_peg/calcs/liquids/ethanol_water_density/{fake_data.py => _fake_data.py} (74%) rename ml_peg/calcs/liquids/ethanol_water_density/{io_tools.py => _io_tools.py} (92%) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analysis.py b/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py similarity index 65% rename from ml_peg/analysis/liquids/ethanol_water_density/analysis.py rename to ml_peg/analysis/liquids/ethanol_water_density/_analysis.py index c96c2aaf1..bcc5fbb55 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/analysis.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py @@ -1,30 +1,38 @@ +"""analyse ethanol-water density curves.""" + +from __future__ import annotations + import numpy as np -M_WATER = 18.01528 # g/mol -M_ETOH = 46.06844 # g/mol +M_WATER = 18.01528 # g/mol +M_ETOH = 46.06844 # g/mol # Pick densities consistent with your reference conditions (T,P!) # If your ref curve is at ~20°C, these are around: -RHO_WATER_PURE = 0.9982 # g/cm^3 -RHO_ETH_PURE = 0.7893 # g/cm^3 - -def x_to_phi_ethanol(x, rho_mix, - *, M_eth=M_ETOH, M_water=M_WATER, - rho_eth=RHO_ETH_PURE, rho_water=RHO_WATER_PURE): - """ - Convert ethanol mole fraction x to ethanol volume fraction phi using - mixture density rho_mix and pure-component densities as volume proxies. - """ +RHO_WATER_PURE = 0.9982 # g/cm^3 +RHO_ETH_PURE = 0.7893 # g/cm^3 + + +def x_to_phi_ethanol( + x, + rho_mix, + *, + m_eth=M_ETOH, + m_water=M_WATER, + rho_eth=RHO_ETH_PURE, + rho_water=RHO_WATER_PURE, +): # TODO: double check formula + """Convert ethanol mole fraction x to ethanol volume fraction phi.""" x = np.asarray(x, dtype=float) rho_mix = np.asarray(rho_mix, dtype=float) - m_eth = x * M_eth - m_wat = (1.0 - x) * M_water + m_eth = x * m_eth + m_wat = (1.0 - x) * m_water + + v_mix = (m_eth + m_wat) / rho_mix # cm^3 per "1 mol mixture basis" + v_eth = m_eth / rho_eth # cm^3 (proxy) + return v_eth / v_mix - V_mix = (m_eth + m_wat) / rho_mix # cm^3 per "1 mol mixture basis" - V_eth = m_eth / rho_eth # cm^3 (proxy) - phi = V_eth / V_mix - return phi def weight_to_mole_fraction(w): """ @@ -44,17 +52,19 @@ def _rmse(a: np.ndarray, b: np.ndarray) -> float: def _interp_1d(x_src: np.ndarray, y_src: np.ndarray, x_tgt: np.ndarray) -> np.ndarray: """ - Linear interpolation. Requires x_tgt within [min(x_src), max(x_src)]. + Linear interpolation. + + Requires x_tgt within [min(x_src), max(x_src)]. """ if np.any(x_tgt < x_src.min() - 1e-12) or np.any(x_tgt > x_src.max() + 1e-12): raise ValueError("Target x values fall outside reference interpolation range.") return np.interp(x_tgt, x_src, y_src) -def _endpoints_at_0_1(x: np.ndarray, y: np.ndarray, tol: float = 1e-8) -> tuple[float, float]: - """ - Return y(x=0) and y(x=1). Requires that x includes (approximately) 0 and 1. - """ +def _endpoints_at_0_1( + x: np.ndarray, y: np.ndarray, tol: float = 1e-8 +) -> tuple[float, float]: + """Return y(x=0) and y(x=1). Requires that x includes (approximately) 0 and 1.""" i0 = np.where(np.isclose(x, 0.0, atol=tol))[0] i1 = np.where(np.isclose(x, 1.0, atol=tol))[0] if len(i0) != 1 or len(i1) != 1: @@ -67,9 +77,7 @@ def _linear_baseline(x: np.ndarray, y0: float, y1: float) -> np.ndarray: def _excess_curve(x: np.ndarray, y: np.ndarray) -> np.ndarray: - """ - Excess relative to the dataset's own pure endpoints (x=0 and x=1). - """ + """Excess relative to the dataset's own pure endpoints (x=0 and x=1).""" y0, y1 = _endpoints_at_0_1(x, y) return y - _linear_baseline(x, y0, y1) @@ -77,6 +85,7 @@ def _excess_curve(x: np.ndarray, y: np.ndarray) -> np.ndarray: def _peak_x_quadratic(x: np.ndarray, y: np.ndarray) -> float: """ Estimate x position of minimum y. + - If min is interior and we have neighbors, fit quadratic through 3 points. - Otherwise fall back to argmin x. """ @@ -99,5 +108,4 @@ def _peak_x_quadratic(x: np.ndarray, y: np.ndarray) -> float: xv = -b / (2.0 * a) # Clamp to local bracket so we don't get silly extrapolation - xv = float(np.clip(xv, xs.min(), xs.max())) - return xv + return float(np.clip(xv, xs.min(), xs.max())) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py b/ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py similarity index 85% rename from ml_peg/analysis/liquids/ethanol_water_density/io_tools.py rename to ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py index 88e662d1d..4015f0fec 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py @@ -1,11 +1,17 @@ +"""i/o tools for analysis of ethanol-water densities.""" + +from __future__ import annotations + import csv import os from pathlib import Path -import numpy as np from matplotlib import pyplot as plt +import numpy as np -from ml_peg.analysis.liquids.ethanol_water_density.analysis import weight_to_mole_fraction +from ml_peg.analysis.liquids.ethanol_water_density.analysis import ( + weight_to_mole_fraction, +) from ml_peg.app import APP_ROOT from ml_peg.calcs import CALCS_ROOT @@ -13,7 +19,7 @@ BENCHMARK = "ethanol_water_density" CALC_PATH = CALCS_ROOT / CATEGORY / BENCHMARK / "outputs" OUT_PATH = APP_ROOT / "data" / CATEGORY / BENCHMARK -DATA_PATH = CALCS_ROOT / CATEGORY / BENCHMARK / "data" +DATA_PATH = CALCS_ROOT / CATEGORY / BENCHMARK / "data" def _debug_plot_enabled() -> bool: @@ -70,17 +76,22 @@ def _read_model_curve(model_name: str) -> tuple[list[float], list[float]]: ax.set_xlabel("step") ax.set_ylabel("rho / g cm$^{-3}$") - _savefig(fig, OUT_PATH / "debug" / model_name / f"x_{x_ethanol:.2f}_timeseries.svg") + _savefig( + fig, + OUT_PATH / "debug" / model_name / f"x_{x_ethanol:.2f}_timeseries.svg", + ) return xs, rhos def read_ref_curve() -> tuple[list[float], list[float]]: """ - Load densities given on a uniform weight-fraction grid - and convert to mole fraction. + Load densities given on a uniform weight-fraction grid. + + And convert to mole fraction. Densities from: - M. Southard and D. Green, Perry’s Chemical Engineers’ Handbook, 9th Edition. McGraw-Hill Education, 2018. + M. Southard and D. Green, Perry’s Chemical Engineers’ Handbook, + 9th Edition. McGraw-Hill Education, 2018. Assumes: - 101 evenly spaced points diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py index 6c8c1a0b1..3827ec510 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py @@ -1,20 +1,33 @@ +"""analyse ethanol-water density curves.""" + # TODO: remove hardcoded things? +from __future__ import annotations + from pathlib import Path -import numpy as np import matplotlib.pyplot as plt +import numpy as np import pytest -from ml_peg.analysis.liquids.ethanol_water_density.analysis import _rmse, _interp_1d, \ - _excess_curve, _peak_x_quadratic, x_to_phi_ethanol -from ml_peg.analysis.liquids.ethanol_water_density.io_tools import OUT_PATH, _debug_plot_enabled, _savefig, \ - _read_model_curve, read_ref_curve +from ml_peg.analysis.liquids.ethanol_water_density.analysis import ( + _excess_curve, + _interp_1d, + _peak_x_quadratic, + _rmse, + x_to_phi_ethanol, +) +from ml_peg.analysis.liquids.ethanol_water_density.io_tools import ( + OUT_PATH, + _debug_plot_enabled, + _read_model_curve, + _savefig, + read_ref_curve, +) from ml_peg.analysis.utils.decorators import build_table, plot_parity from ml_peg.analysis.utils.utils import load_metrics_config from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models - MODELS = get_model_names(current_models) MODEL_INDEX = {name: i for i, name in enumerate(MODELS)} # duplicate in calc @@ -29,6 +42,7 @@ @pytest.fixture(scope="session") def ref_curve() -> tuple[np.ndarray, np.ndarray]: + """Return reference density curve.""" x_ref, rho_ref = read_ref_curve() x = np.asarray(x_ref, dtype=float) rho = np.asarray(rho_ref, dtype=float) @@ -40,6 +54,7 @@ def ref_curve() -> tuple[np.ndarray, np.ndarray]: @pytest.fixture def model_curves() -> dict[str, tuple[np.ndarray, np.ndarray]]: + """Return simulated density curves.""" curves: dict[str, tuple[np.ndarray, np.ndarray]] = {} for model_name in MODELS: xs, rhos = _read_model_curve(model_name) @@ -57,11 +72,12 @@ def model_curves() -> dict[str, tuple[np.ndarray, np.ndarray]]: title="Ethanol–water density (293.15 K)", x_label="Reference density / g cm⁻³", y_label="Predicted density / g cm⁻³", - #hoverdata={ + # hoverdata={ # "x_ethanol": [], # filled in fixture - #}, + # }, ) # TODO: read docs!!! doesn't seem to work yet. def densities_parity(ref_curve, model_curves) -> dict[str, list]: + """Parity plot of simulated and reference density.""" x_ref, rho_ref = ref_curve # Use the first model's x grid for hover labels (parity requires same-length lists) @@ -84,15 +100,14 @@ def densities_parity(ref_curve, model_curves) -> dict[str, list]: rho_m_on_grid = rho_m results[m] = list(rho_m_on_grid) - ## Patch hoverdata list in-place (decorator reads the dict) - ## NOTE: if your decorator captures hoverdata at decoration time, - ## switch to hoverdata={"x_ethanol": x_labels()} fixture pattern like the docs. - #densities_parity.__wrapped__.__dict__.setdefault("hoverdata", {})["x_ethanol"] = list(x_grid) - return results + @pytest.fixture -def debug_curve_plots(ref_curve, model_curves) -> None: # TODO should I remove or use a different format? +def debug_curve_plots( + ref_curve, model_curves +) -> None: # TODO should I remove or use a different format? + """Plot density curves.""" if not _debug_plot_enabled(): return print("plotting curves") @@ -120,7 +135,9 @@ def debug_curve_plots(ref_curve, model_curves) -> None: # TODO should I remove fig, ax = plt.subplots() ax.plot(x_ref, _excess_curve(x_ref, rho_ref), label="ref (dense)") ax.plot(x_m, _excess_curve(x_m, rho_m), marker="o", label=f"{m} (model)") - ax.plot(x_m, _excess_curve(x_m, rho_ref_m), marker="x", label="ref on model grid") + ax.plot( + x_m, _excess_curve(x_m, rho_ref_m), marker="x", label="ref on model grid" + ) ax.set_title(f"Density curve: {m}") ax.set_xlabel("x_ethanol") ax.set_ylabel("rho / g cm$^{-3}$") @@ -131,7 +148,7 @@ def debug_curve_plots(ref_curve, model_curves) -> None: # TODO should I remove # volume fraction plot phi_ref = x_to_phi_ethanol(x_ref, rho_ref) - phi_m = x_to_phi_ethanol(x_m, rho_m) + phi_m = x_to_phi_ethanol(x_m, rho_m) fig, ax = plt.subplots() ax.plot(phi_ref, rho_ref, label="ref (dense)") @@ -150,6 +167,7 @@ def debug_curve_plots(ref_curve, model_curves) -> None: # TODO should I remove @pytest.fixture def rmse_density(ref_curve, model_curves) -> dict[str, float]: + """RMSE of the density vs reference density.""" x_ref, rho_ref = ref_curve out: dict[str, float] = {} for m, (x_m, rho_m) in model_curves.items(): @@ -160,9 +178,7 @@ def rmse_density(ref_curve, model_curves) -> dict[str, float]: @pytest.fixture def rmse_excess_density(ref_curve, model_curves) -> dict[str, float]: - """ - RMSE of excess density (detrended by each dataset's own pure endpoints). - """ + """RMSE of excess density (detrended by each dataset's own pure endpoints).""" x_ref, rho_ref = ref_curve out: dict[str, float] = {} @@ -198,19 +214,23 @@ def peak_x_error(ref_curve, model_curves) -> dict[str, float]: return out + # ----------------------------------------------------------------------------- # Table # ----------------------------------------------------------------------------- + @pytest.fixture @build_table( thresholds=DEFAULT_THRESHOLDS, filename=OUT_PATH / "density_metrics_table.json", metric_tooltips={ "Model": "Name of the model", - "RMSE density": "RMSE between model and reference density at model compositions (g cm⁻³).", + "RMSE density": "RMSE between model and reference density" + "at model compositions (g cm⁻³).", "RMSE excess density": ( - "RMSE after subtracting each curve’s linear baseline between pure endpoints (g cm⁻³)." + "RMSE after subtracting each curve’s linear baseline" + "between pure endpoints (g cm⁻³)." ), "Peak x error": ( "Absolute difference in mole-fraction location of maximum excess density." @@ -222,6 +242,7 @@ def metrics( rmse_excess_density: dict[str, float], peak_x_error: dict[str, float], ) -> dict[str, dict]: + """Return metric data.""" return { "RMSE density": rmse_density, "RMSE excess density": rmse_excess_density, @@ -229,10 +250,17 @@ def metrics( } -def test_ethanol_water_density(metrics: dict[str, dict], densities_parity: dict[str, list], debug_curve_plots) -> None: - """ - Launch analysis (decorators handle writing JSON artifacts for the app). - """ - print(MODEL_INDEX) # TODO: these print statements may be useful for debugging, but should I remove? - print({key0:{MODEL_INDEX[name]: value for name, value in value0.items()} for key0, value0 in metrics.items()}) +def test_ethanol_water_density( + metrics: dict[str, dict], densities_parity: dict[str, list], debug_curve_plots +) -> None: + """Launch analysis.""" + print( + MODEL_INDEX + ) # TODO: these print statements may be useful for debugging, but should I remove? + print( + { + key0: {MODEL_INDEX[name]: value for name, value in value0.items()} + for key0, value0 in metrics.items() + } + ) return diff --git a/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py b/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py index 2ded1d118..cb4e3c400 100644 --- a/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py +++ b/ml_peg/app/liquids/ethanol_water_density/app_ethanol_water_density.py @@ -1,6 +1,9 @@ -#TODO: This does not work. Fix this +# TODO: This does not work. Fix this """Run ethanol–water density (decomposition curves) app.""" + +from __future__ import annotations + from dash import Dash from dash.html import Div @@ -16,9 +19,7 @@ CATEGORY = "liquids" BENCHMARK_NAME = "ethanol_water_density" -DOCS_URL = ( - "https://ddmms.github.io/ml-peg/user_guide/benchmarks/" # TODO: update to the right anchor -) +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/" DATA_PATH = APP_ROOT / "data" / CATEGORY / BENCHMARK_NAME @@ -28,11 +29,11 @@ class EthanolWaterDecompositionCurvesApp(BaseApp): def register_callbacks(self) -> None: """Register callbacks to app.""" - parity = read_plot(DATA_PATH / "density_parity.json", id=f"{BENCHMARK_NAME}-figure") + parity = read_plot( + DATA_PATH / "density_parity.json", id=f"{BENCHMARK_NAME}-figure" + ) # When the user clicks a metric column in the table, show the parity plot. - # (This mirrors the GMTKN55 pattern: different columns can map to different plots; - # here they all map to the same parity plot artifact.) plot_from_table_column( table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", diff --git a/ml_peg/calcs/liquids/ethanol_water_density/compositions.py b/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py similarity index 89% rename from ml_peg/calcs/liquids/ethanol_water_density/compositions.py rename to ml_peg/calcs/liquids/ethanol_water_density/_compositions.py index 0c47100bc..bac4af789 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/compositions.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py @@ -1,3 +1,7 @@ +"""Load composition data.""" + +from __future__ import annotations + import csv from dataclasses import dataclass from pathlib import Path @@ -8,6 +12,8 @@ @dataclass(frozen=True) class CompositionCase: + """Map composition to file.""" + x_ethanol: float filename: str diff --git a/ml_peg/calcs/liquids/ethanol_water_density/fake_data.py b/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py similarity index 74% rename from ml_peg/calcs/liquids/ethanol_water_density/fake_data.py rename to ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py index 8fddcc073..0b28195e0 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/fake_data.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py @@ -1,46 +1,49 @@ -# for debugging, to verify that metrics actually do something reasonable +"""for debugging, to verify that metrics actually do something reasonable.""" + +from __future__ import annotations from dataclasses import dataclass + import numpy as np from ml_peg.analysis.liquids.ethanol_water_density.io_tools import read_ref_curve + @dataclass(frozen=True) class FakeCurveParams: + """Class for fake curve parameters.""" + # Master knob: 0 -> perfect match, 1 -> very poor severity: float = 0.0 # Individual error components (interpreted as "max at severity=1") - bias: float = 0.0 # additive offset in y-units - scale: float = 0.0 # multiplicative: y *= (1 + scale*...) - tilt: float = 0.0 # linear-in-x additive distortion - warp: float = 0.0 # smooth nonlinear additive distortion + bias: float = 0.0 # additive offset in y-units + scale: float = 0.0 # multiplicative: y *= (1 + scale*...) + tilt: float = 0.0 # linear-in-x additive distortion + warp: float = 0.0 # smooth nonlinear additive distortion - noise_sigma: float = 0.0 # iid gaussian noise in y-units - corr_len: float = 0.0 # if >0, adds correlated noise along x + noise_sigma: float = 0.0 # iid gaussian noise in y-units + corr_len: float = 0.0 # if >0, adds correlated noise along x - bump_amp: float = 0.0 # amplitude of local bump(s) - bump_center: float = 0.5 # x location of bump - bump_width: float = 0.08 # bump width (in x units) + bump_amp: float = 0.0 # amplitude of local bump(s) + bump_center: float = 0.5 # x location of bump + bump_width: float = 0.08 # bump width (in x units) -def _smooth_random_field(xs: np.ndarray, corr_len: float, rng: np.random.Generator) -> np.ndarray: - """ - Create a zero-mean, ~unit-std smooth random field along xs using - a Gaussian kernel in x-distance. O(N^2) but tiny N here (6 points). - """ +def _smooth_random_field( + xs: np.ndarray, corr_len: float, rng: np.random.Generator +) -> np.ndarray: + """Create a zero-mean, ~unit-std smooth random field along xs.""" if corr_len <= 0: return np.zeros_like(xs) dx = xs[:, None] - xs[None, :] - K = np.exp(-0.5 * (dx / corr_len) ** 2) + k = np.exp(-0.5 * (dx / corr_len) ** 2) # sample correlated normal: K^(1/2) z via cholesky (add jitter for stability) - L = np.linalg.cholesky(K + 1e-12 * np.eye(len(xs))) z = rng.standard_normal(len(xs)) - field = L @ z + field = np.linalg.cholesky(k + 1e-12 * np.eye(len(xs))) @ z field = field - field.mean() - field = field / (field.std() + 1e-12) - return field + return field / (field.std() + 1e-12) def make_fake_curve_from_ref( @@ -53,7 +56,6 @@ def make_fake_curve_from_ref( ) -> tuple[list[float], list[float]]: """ Return (xs, ys_fake) using the same xs as the reference. - Designed for density-like curves but works generically. `severity` scales *all* enabled components. For example, if bias=10 and severity=0.2, you get ~2 units of bias (with a tiny randomization). @@ -70,15 +72,16 @@ def make_fake_curve_from_ref( # Small randomization so multiple models with same severity aren’t identical # (but still deterministic for a given seed). - jitter = lambda: (0.85 + 0.30 * rng.random()) + def jitter(): + return 0.85 + 0.30 * rng.random() # 1) multiplicative scale error if params.scale != 0.0 and sev > 0: - y *= (1.0 + (params.scale * sev * jitter())) + y *= 1.0 + (params.scale * sev * jitter()) # 2) additive bias if params.bias != 0.0 and sev > 0: - y += (params.bias * sev * jitter()) + y += params.bias * sev * jitter() # 3) linear tilt (additive) if params.tilt != 0.0 and sev > 0: @@ -87,14 +90,16 @@ def make_fake_curve_from_ref( # 4) smooth nonlinear warp (additive): use a low-order smooth basis if params.warp != 0.0 and sev > 0: # cubic-ish shape distortion with zero mean - w = (xpm**3 - xpm * np.mean(xpm**2)) + w = xpm**3 - xpm * np.mean(xpm**2) w = w - w.mean() w = w / (np.std(w) + 1e-12) y += (params.warp * sev * jitter()) * w # 5) local bump to simulate specific composition failure if params.bump_amp != 0.0 and sev > 0: - bump = np.exp(-0.5 * ((xs - params.bump_center) / (params.bump_width + 1e-12)) ** 2) + bump = np.exp( + -0.5 * ((xs - params.bump_center) / (params.bump_width + 1e-12)) ** 2 + ) bump = bump / (bump.max() + 1e-12) y += (params.bump_amp * sev * jitter()) * bump @@ -115,11 +120,11 @@ def make_fake_curve_from_ref( return xs, y -# Convenience presets: "good", "medium", "bad" def make_fake_curve( - kind: str|int, + kind: str | int, seed: int | None = 0, ) -> tuple[list[float], list[float]]: + """Make a fake density curve based on a reference.""" xs_ref, ys_ref = read_ref_curve() kind = kind.lower().strip() if isinstance(kind, str) else kind @@ -174,15 +179,16 @@ def make_fake_density_timeseries( *, seed: int, start_offset: float = 0.02, # initial deviation from eq - tau: float = 0.15, # relaxation rate (bigger -> faster) + tau: float = 0.15, # relaxation rate (bigger -> faster) noise_sigma: float = 0.001, # per-step noise ) -> list[float]: + """Make a time series of fake density values.""" rng = np.random.default_rng(seed) rho0 = rho_eq + start_offset * (2 * rng.random() - 1) series = [] rho = rho0 - for t in range(n_steps): + for _ in range(n_steps): # exponential-ish relaxation to rho_eq rho += tau * (rho_eq - rho) # add noise diff --git a/ml_peg/calcs/liquids/ethanol_water_density/io_tools.py b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py similarity index 92% rename from ml_peg/calcs/liquids/ethanol_water_density/io_tools.py rename to ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py index e3a03a4c4..0adfce05b 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/io_tools.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py @@ -1,6 +1,10 @@ +"""i/o tools for calculations.""" + +from __future__ import annotations + +from collections.abc import Iterable import csv from pathlib import Path -from typing import Iterable def write_density_timeseries_checkpointed( @@ -46,10 +50,7 @@ def write_density_timeseries_checkpointed( if old_vals: n = min(len(old_vals), len(rho_series)) - matches = sum( - abs(old_vals[i] - rho_series[i]) < 1e-6 - for i in range(n) - ) + matches = sum(abs(old_vals[i] - rho_series[i]) < 1e-6 for i in range(n)) frac = matches / n if n else 0.0 @@ -70,9 +71,6 @@ def write_density_timeseries_checkpointed( w.writerow([i, f"{rho:.8f}"]) - - - class DensityTimeseriesLogger: """ Streaming CSV logger for density time series. @@ -95,6 +93,7 @@ def __init__(self, path: Path, *, overwrite: bool = True): # context manager API # --------------------- def __enter__(self): + """Open the file.""" if self.overwrite and self.path.exists(): self.path.unlink() @@ -107,6 +106,7 @@ def __enter__(self): return self def __exit__(self, exc_type, exc, tb): + """Close the file.""" if self._f: self._f.close() diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 1b0574554..928b178fb 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -1,4 +1,7 @@ -import csv +"""calculate ethanol-water density curves.""" + +from __future__ import annotations + import os from pathlib import Path from typing import Any @@ -15,7 +18,9 @@ make_fake_curve, make_fake_density_timeseries, ) -from ml_peg.calcs.liquids.ethanol_water_density.io_tools import write_density_timeseries_checkpointed +from ml_peg.calcs.liquids.ethanol_water_density.io_tools import ( + write_density_timeseries_checkpointed, +) from ml_peg.calcs.liquids.ethanol_water_density.md_code import run_one_case from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models @@ -35,9 +40,12 @@ def _case_id(composition) -> str: return f"x={composition.x_ethanol:.2f}" -@pytest.mark.parametrize("mlip", MODELS.items(), ids=[n for n in MODELS.keys()]) -@pytest.mark.parametrize("composition", COMPOSITIONS, ids=[_case_id(c) for c in COMPOSITIONS]) +@pytest.mark.parametrize("mlip", MODELS.items(), ids=list(MODELS.keys())) +@pytest.mark.parametrize( + "composition", COMPOSITIONS, ids=[_case_id(c) for c in COMPOSITIONS] +) def test_water_ethanol_density_curve(mlip: tuple[str, Any], composition) -> None: + """Either run the md simulation or fake the data.""" if not FAKE_DATA: water_ethanol_density_curve_one_case(mlip, composition) else: @@ -45,6 +53,7 @@ def test_water_ethanol_density_curve(mlip: tuple[str, Any], composition) -> None def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: + """Run an md simulation.""" model_name, model = mlip # TODO: dispersion ??? model_out = OUT_PATH / model_name @@ -54,7 +63,9 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: struct_path = DATA_PATH / case.filename if not struct_path.exists(): - raise FileNotFoundError(f"Missing structure for x={case.x_ethanol}: {struct_path}") + raise FileNotFoundError( + f"Missing structure for x={case.x_ethanol}: {struct_path}" + ) case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" case_dir.mkdir(parents=True, exist_ok=True) @@ -66,6 +77,7 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> None: + """Generate fake data for debugging instead of running the test.""" model_name, model = mlip model_out = OUT_PATH / model_name @@ -94,6 +106,17 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No if __name__ == "__main__": # TODO: delete this # run a very small simulation to see if it does something reasonable from mace.calculators import mace_mp + calc = mace_mp("data_old/mace-omat-0-small.model") - rho = run_one_case("data/mix_xe_0.00.extxyz", calc, nvt_stabilise_steps=200, npt_settle_steps=1000, nvt_thermalise_steps=250, npt_equil_steps=1000, npt_prod_steps=1000, log_every=50, workdir=Path("debug")) - print(rho) \ No newline at end of file + rho = run_one_case( + "data/mix_xe_0.00.extxyz", + calc, + nvt_stabilise_steps=200, + npt_settle_steps=1000, + nvt_thermalise_steps=250, + npt_equil_steps=1000, + npt_prod_steps=1000, + log_every=50, + workdir=Path("debug"), + ) + print(rho) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index c8c2ba280..d905b4c59 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -1,34 +1,46 @@ -import time +"""code for md simulation.""" + +from __future__ import annotations + +from collections.abc import Iterable from contextlib import contextmanager from pathlib import Path -from typing import Any, Iterable +import time +from typing import Any -import numpy as np from ase.io import Trajectory, read, write -from ase.md import MDLogger, Langevin +from ase.md import Langevin, MDLogger from ase.md.langevinbaoab import LangevinBAOAB from ase.md.nptberendsen import NPTBerendsen -from ase.md.velocitydistribution import MaxwellBoltzmannDistribution, Stationary, ZeroRotation +from ase.md.velocitydistribution import ( + MaxwellBoltzmannDistribution, + Stationary, + ZeroRotation, +) from ase.optimize import FIRE -from ase.units import fs, bar +from ase.units import bar, fs +import numpy as np from ml_peg.calcs.liquids.ethanol_water_density.io_tools import DensityTimeseriesLogger def total_mass_kg(atoms): + """Return the mass in kg for ase atoms.""" amu_to_kg = 1.66053906660e-27 return atoms.get_masses().sum() * amu_to_kg def density_g_cm3(atoms): - V_A3 = atoms.get_volume() - V_m3 = V_A3 * 1e-30 + """Return density in g/cm^3.""" + v_a3 = atoms.get_volume() + v_m3 = v_a3 * 1e-30 m_kg = total_mass_kg(atoms) - rho_kg_m3 = m_kg / V_m3 + rho_kg_m3 = m_kg / v_m3 return rho_kg_m3 / 1000.0 def attach_basic_logging(dyn, atoms, md_logfile, log_every, t0): + """Attach a logger to an ase md simulation.""" logger = MDLogger( dyn, atoms, @@ -43,14 +55,14 @@ def attach_basic_logging(dyn, atoms, md_logfile, log_every, t0): def progress(): step = dyn.get_number_of_steps() rho = density_g_cm3(atoms) - V = atoms.get_volume() - T = atoms.get_temperature() + volume = atoms.get_volume() + temperature = atoms.get_temperature() elapsed = time.time() - t0 print( f"[step {step:>8}] " - f"T={T:7.2f} K | " - f"V={V:10.2f} A^3 | " + f"T={temperature:7.2f} K | " + f"V={volume:10.2f} A^3 | " f"rho={rho:7.4f} g/cm^3 | " f"elapsed={elapsed:6.1f}s" ) @@ -60,9 +72,10 @@ def progress(): @contextmanager def traj_logging(dyn, atoms, workdir, traj_every: int, name="md.traj"): + """Context manager for logging trajectory.""" traj = None if traj_every and traj_every > 0: - traj = Trajectory(str(workdir/name), "a", atoms) + traj = Trajectory(str(workdir / name), "a", atoms) dyn.attach(traj.write, interval=traj_every) try: yield traj @@ -70,21 +83,22 @@ def traj_logging(dyn, atoms, workdir, traj_every: int, name="md.traj"): if traj is not None: traj.close() + def run_one_case( struct_path: Path, calc: Any, *, - T_K: float = 298.15, - P_bar: float = 1.0, + temperature: float = 298.15, + p_bar: float = 1.0, dt_fs: float = 0.5, nvt_stabilise_steps: int = 4_000, - npt_settle_steps = 7_500, + npt_settle_steps=7_500, nvt_thermalise_steps: int = 1_000, npt_equil_steps: int = 10_000, npt_prod_steps: int = 25_000, sample_every: int = 20, log_every: int = 200, - log_trajectory_every: int=400, + log_trajectory_every: int = 400, dummy_data=False, workdir: Path, ) -> Iterable[float]: @@ -95,7 +109,9 @@ def run_one_case( """ ts_path = workdir / "density_timeseries.csv" if dummy_data: - rho_series = np.random.normal(loc=0.9,scale=0.05,size=npt_prod_steps//sample_every) + rho_series = np.random.normal( + loc=0.9, scale=0.05, size=npt_prod_steps // sample_every + ) with DensityTimeseriesLogger(ts_path) as density_log: for rho in rho_series: density_log.write(rho) @@ -110,15 +126,16 @@ def run_one_case( opt.run(fmax=0.15) # velocities - MaxwellBoltzmannDistribution(atoms, temperature_K=T_K) + MaxwellBoltzmannDistribution(atoms, temperature_K=temperature) Stationary(atoms) ZeroRotation(atoms) dt = dt_fs * fs t0 = time.time() - # the used pre-relax is not good enough, do some Langevin NVT steps before starting NPT - dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.02) + # the used pre-relax is not good enough + # do some Langevin NVT steps before starting NPT + dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=0.02) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): dyn.run(nvt_stabilise_steps) @@ -128,8 +145,8 @@ def run_one_case( dyn = NPTBerendsen( atoms, timestep=dt, - temperature_K=T_K, - pressure_au=P_bar * bar, + temperature_K=temperature, + pressure_au=p_bar * bar, taut=0.07 * ps, taup=0.4 * ps, compressibility=4.5e-5, @@ -139,10 +156,10 @@ def run_one_case( dyn.run(npt_settle_steps) # thermalise - MaxwellBoltzmannDistribution(atoms, temperature_K=T_K) + MaxwellBoltzmannDistribution(atoms, temperature_K=temperature) Stationary(atoms) ZeroRotation(atoms) - dyn = Langevin(atoms, timestep=dt, temperature_K=T_K, friction=0.03) + dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=0.03) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): dyn.run(nvt_thermalise_steps) @@ -151,8 +168,8 @@ def run_one_case( dyn = LangevinBAOAB( # MTK atoms, timestep=dt, - temperature_K=T_K, - externalstress=P_bar * bar, + temperature_K=temperature, + externalstress=p_bar * bar, T_tau=0.1 * ps, P_tau=1 * ps, hydrostatic=True, From c079f239c8c2f1f5899070bf2515b97fa1b40c77 Mon Sep 17 00:00:00 2001 From: Arn Date: Tue, 10 Feb 2026 16:46:05 +0000 Subject: [PATCH 06/12] fix imports --- .../analyse_ethanol_water_density.py | 2 +- .../ethanol_water_density/{_io_tools.py => io_tools.py} | 2 +- ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py | 3 ++- .../ethanol_water_density/calc_ethanol_water_density.py | 8 ++++---- ml_peg/calcs/liquids/ethanol_water_density/md_code.py | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) rename ml_peg/analysis/liquids/ethanol_water_density/{_io_tools.py => io_tools.py} (97%) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py index 3827ec510..abf6111ff 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from ml_peg.analysis.liquids.ethanol_water_density.analysis import ( +from ml_peg.analysis.liquids.ethanol_water_density._analysis import ( _excess_curve, _interp_1d, _peak_x_quadratic, diff --git a/ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py similarity index 97% rename from ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py rename to ml_peg/analysis/liquids/ethanol_water_density/io_tools.py index 4015f0fec..5d469d6e1 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/_io_tools.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py @@ -9,7 +9,7 @@ from matplotlib import pyplot as plt import numpy as np -from ml_peg.analysis.liquids.ethanol_water_density.analysis import ( +from ml_peg.analysis.liquids.ethanol_water_density._analysis import ( weight_to_mole_fraction, ) from ml_peg.app import APP_ROOT diff --git a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py index 0adfce05b..720aff283 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py @@ -12,6 +12,7 @@ def write_density_timeseries_checkpointed( rho_series: Iterable[float], *, min_match_fraction: float = 0.8, + do_not_raise: bool = False, ) -> None: """ Write density_timeseries.csv with checkpoint validation. @@ -54,7 +55,7 @@ def write_density_timeseries_checkpointed( frac = matches / n if n else 0.0 - if frac < min_match_fraction: + if frac < min_match_fraction and not do_not_raise: raise AssertionError( f"{ts_path}: only {frac:.1%} of checkpoint values match " f"(expected ≥ {min_match_fraction:.0%}). " diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 928b178fb..d5a0e2e16 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -9,16 +9,16 @@ import numpy as np import pytest -from ml_peg.calcs.liquids.ethanol_water_density.compositions import ( +from ml_peg.calcs.liquids.ethanol_water_density._compositions import ( BENCH_ROOT, DATA_PATH, load_compositions, ) -from ml_peg.calcs.liquids.ethanol_water_density.fake_data import ( +from ml_peg.calcs.liquids.ethanol_water_density._fake_data import ( make_fake_curve, make_fake_density_timeseries, ) -from ml_peg.calcs.liquids.ethanol_water_density.io_tools import ( +from ml_peg.calcs.liquids.ethanol_water_density._io_tools import ( write_density_timeseries_checkpointed, ) from ml_peg.calcs.liquids.ethanol_water_density.md_code import run_one_case @@ -100,7 +100,7 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No ) ts_path = case_dir / "density_timeseries.csv" - write_density_timeseries_checkpointed(ts_path, rho_series) + write_density_timeseries_checkpointed(ts_path, rho_series, do_not_raise=True) if __name__ == "__main__": # TODO: delete this diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index d905b4c59..1e62b19bc 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -21,7 +21,7 @@ from ase.units import bar, fs import numpy as np -from ml_peg.calcs.liquids.ethanol_water_density.io_tools import DensityTimeseriesLogger +from ml_peg.calcs.liquids.ethanol_water_density._io_tools import DensityTimeseriesLogger def total_mass_kg(atoms): From af09d974326e64b9205fd88d2c05749dae637661 Mon Sep 17 00:00:00 2001 From: Arn De Moor <86566033+arnon-1@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:58:42 +0000 Subject: [PATCH 07/12] Fix liquids numpydoc issues for pre-commit --- .../ethanol_water_density/_analysis.py | 143 ++++++++++++++++-- .../analyse_ethanol_water_density.py | 137 +++++++++++++++-- .../liquids/ethanol_water_density/io_tools.py | 49 ++++-- .../ethanol_water_density/_compositions.py | 18 ++- .../ethanol_water_density/_fake_data.py | 118 +++++++++++++-- .../ethanol_water_density/_io_tools.py | 95 +++++++++--- .../calc_ethanol_water_density.py | 63 +++++++- .../liquids/ethanol_water_density/md_code.py | 115 +++++++++++++- 8 files changed, 649 insertions(+), 89 deletions(-) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py b/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py index bcc5fbb55..5f292e345 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/_analysis.py @@ -1,4 +1,4 @@ -"""analyse ethanol-water density curves.""" +"""Analyse ethanol-water density curves.""" from __future__ import annotations @@ -22,7 +22,29 @@ def x_to_phi_ethanol( rho_eth=RHO_ETH_PURE, rho_water=RHO_WATER_PURE, ): # TODO: double check formula - """Convert ethanol mole fraction x to ethanol volume fraction phi.""" + """ + Convert ethanol mole fraction to ethanol volume fraction. + + Parameters + ---------- + x : array-like + Ethanol mole fraction. + rho_mix : array-like + Mixture density in g/cm^3 at each composition. + m_eth : float, optional + Ethanol molar mass in g/mol. + m_water : float, optional + Water molar mass in g/mol. + rho_eth : float, optional + Pure ethanol density in g/cm^3. + rho_water : float, optional + Pure water density in g/cm^3. + + Returns + ------- + numpy.ndarray + Ethanol volume fraction for each input composition. + """ x = np.asarray(x, dtype=float) rho_mix = np.asarray(rho_mix, dtype=float) @@ -35,10 +57,18 @@ def x_to_phi_ethanol( def weight_to_mole_fraction(w): - """ - Convert ethanol weight fraction -> mole fraction. - - w = mass_ethanol / total_mass + r""" + Convert ethanol weight fraction to mole fraction. + + Parameters + ---------- + w : array-like + Ethanol weight fraction :math:`m_\mathrm{ethanol} / m_\mathrm{total}`. + + Returns + ------- + numpy.ndarray + Ethanol mole fraction. """ n_e = w / M_ETOH n_w = (1 - w) / M_WATER @@ -46,15 +76,42 @@ def weight_to_mole_fraction(w): def _rmse(a: np.ndarray, b: np.ndarray) -> float: + """ + Compute root-mean-square error between two arrays. + + Parameters + ---------- + a : numpy.ndarray + First array. + b : numpy.ndarray + Second array. + + Returns + ------- + float + Root-mean-square error. + """ d = a - b return float(np.sqrt(np.mean(d * d))) def _interp_1d(x_src: np.ndarray, y_src: np.ndarray, x_tgt: np.ndarray) -> np.ndarray: """ - Linear interpolation. - - Requires x_tgt within [min(x_src), max(x_src)]. + Linearly interpolate onto target x values. + + Parameters + ---------- + x_src : numpy.ndarray + Source x grid. + y_src : numpy.ndarray + Source y values. + x_tgt : numpy.ndarray + Target x positions. + + Returns + ------- + numpy.ndarray + Interpolated y values at ``x_tgt``. """ if np.any(x_tgt < x_src.min() - 1e-12) or np.any(x_tgt > x_src.max() + 1e-12): raise ValueError("Target x values fall outside reference interpolation range.") @@ -64,7 +121,23 @@ def _interp_1d(x_src: np.ndarray, y_src: np.ndarray, x_tgt: np.ndarray) -> np.nd def _endpoints_at_0_1( x: np.ndarray, y: np.ndarray, tol: float = 1e-8 ) -> tuple[float, float]: - """Return y(x=0) and y(x=1). Requires that x includes (approximately) 0 and 1.""" + """ + Return y values at x=0 and x=1. + + Parameters + ---------- + x : numpy.ndarray + Composition grid. + y : numpy.ndarray + Property values. + tol : float, optional + Absolute tolerance used to identify endpoint compositions. + + Returns + ------- + tuple[float, float] + Pair ``(y0, y1)`` for x=0 and x=1. + """ i0 = np.where(np.isclose(x, 0.0, atol=tol))[0] i1 = np.where(np.isclose(x, 1.0, atol=tol))[0] if len(i0) != 1 or len(i1) != 1: @@ -73,21 +146,61 @@ def _endpoints_at_0_1( def _linear_baseline(x: np.ndarray, y0: float, y1: float) -> np.ndarray: + """ + Build the straight line connecting values at x=0 and x=1. + + Parameters + ---------- + x : numpy.ndarray + Composition grid. + y0 : float + Value at x=0. + y1 : float + Value at x=1. + + Returns + ------- + numpy.ndarray + Linear baseline evaluated at ``x``. + """ return y0 + x * (y1 - y0) def _excess_curve(x: np.ndarray, y: np.ndarray) -> np.ndarray: - """Excess relative to the dataset's own pure endpoints (x=0 and x=1).""" + """ + Compute excess curve relative to endpoint linear interpolation. + + Parameters + ---------- + x : numpy.ndarray + Composition grid. + y : numpy.ndarray + Property values. + + Returns + ------- + numpy.ndarray + Excess values ``y - y_linear``. + """ y0, y1 = _endpoints_at_0_1(x, y) return y - _linear_baseline(x, y0, y1) def _peak_x_quadratic(x: np.ndarray, y: np.ndarray) -> float: """ - Estimate x position of minimum y. - - - If min is interior and we have neighbors, fit quadratic through 3 points. - - Otherwise fall back to argmin x. + Estimate x position of the minimum by local quadratic fitting. + + Parameters + ---------- + x : numpy.ndarray + Composition grid. + y : numpy.ndarray + Property values. + + Returns + ------- + float + Estimated composition of the minimum. """ if len(x) < 3: return float(x[int(np.argmin(y))]) diff --git a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py index abf6111ff..142b802e0 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/analyse_ethanol_water_density.py @@ -1,4 +1,4 @@ -"""analyse ethanol-water density curves.""" +"""Analyse ethanol-water density curves.""" # TODO: remove hardcoded things? from __future__ import annotations @@ -42,7 +42,14 @@ @pytest.fixture(scope="session") def ref_curve() -> tuple[np.ndarray, np.ndarray]: - """Return reference density curve.""" + """ + Return the reference density curve on a sorted mole-fraction grid. + + Returns + ------- + tuple[numpy.ndarray, numpy.ndarray] + Sorted mole fractions and reference densities. + """ x_ref, rho_ref = read_ref_curve() x = np.asarray(x_ref, dtype=float) rho = np.asarray(rho_ref, dtype=float) @@ -54,7 +61,14 @@ def ref_curve() -> tuple[np.ndarray, np.ndarray]: @pytest.fixture def model_curves() -> dict[str, tuple[np.ndarray, np.ndarray]]: - """Return simulated density curves.""" + """ + Return simulated model density curves on sorted composition grids. + + Returns + ------- + dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Mapping from model name to x-grid and density values. + """ curves: dict[str, tuple[np.ndarray, np.ndarray]] = {} for model_name in MODELS: xs, rhos = _read_model_curve(model_name) @@ -77,7 +91,21 @@ def model_curves() -> dict[str, tuple[np.ndarray, np.ndarray]]: # }, ) # TODO: read docs!!! doesn't seem to work yet. def densities_parity(ref_curve, model_curves) -> dict[str, list]: - """Parity plot of simulated and reference density.""" + """ + Build parity-plot payload for model and reference densities. + + Parameters + ---------- + ref_curve : tuple[numpy.ndarray, numpy.ndarray] + Reference composition and density arrays. + model_curves : dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Per-model composition and density arrays. + + Returns + ------- + dict[str, list] + Reference and model densities sampled on a common grid. + """ x_ref, rho_ref = ref_curve # Use the first model's x grid for hover labels (parity requires same-length lists) @@ -107,7 +135,21 @@ def densities_parity(ref_curve, model_curves) -> dict[str, list]: def debug_curve_plots( ref_curve, model_curves ) -> None: # TODO should I remove or use a different format? - """Plot density curves.""" + """ + Generate optional debug plots for densities and excess properties. + + Parameters + ---------- + ref_curve : tuple[numpy.ndarray, numpy.ndarray] + Reference composition and density arrays. + model_curves : dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Per-model composition and density arrays. + + Returns + ------- + None + Plots are written to disk when debug plotting is enabled. + """ if not _debug_plot_enabled(): return print("plotting curves") @@ -167,7 +209,21 @@ def debug_curve_plots( @pytest.fixture def rmse_density(ref_curve, model_curves) -> dict[str, float]: - """RMSE of the density vs reference density.""" + """ + Compute density RMSE versus interpolated reference values. + + Parameters + ---------- + ref_curve : tuple[numpy.ndarray, numpy.ndarray] + Reference composition and density arrays. + model_curves : dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Per-model composition and density arrays. + + Returns + ------- + dict[str, float] + RMSE values in g/cm^3 keyed by model name. + """ x_ref, rho_ref = ref_curve out: dict[str, float] = {} for m, (x_m, rho_m) in model_curves.items(): @@ -178,7 +234,21 @@ def rmse_density(ref_curve, model_curves) -> dict[str, float]: @pytest.fixture def rmse_excess_density(ref_curve, model_curves) -> dict[str, float]: - """RMSE of excess density (detrended by each dataset's own pure endpoints).""" + """ + Compute RMSE of excess density curves. + + Parameters + ---------- + ref_curve : tuple[numpy.ndarray, numpy.ndarray] + Reference composition and density arrays. + model_curves : dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Per-model composition and density arrays. + + Returns + ------- + dict[str, float] + Excess-density RMSE values keyed by model name. + """ x_ref, rho_ref = ref_curve out: dict[str, float] = {} @@ -196,10 +266,19 @@ def rmse_excess_density(ref_curve, model_curves) -> dict[str, float]: @pytest.fixture def peak_x_error(ref_curve, model_curves) -> dict[str, float]: """ - Absolute error in the x-position of the maximum excess density. - - Ref peak is computed on the dense reference curve. - Model peak is computed on its (coarse) grid with a local quadratic refinement. + Compute absolute error in composition of maximum excess density. + + Parameters + ---------- + ref_curve : tuple[numpy.ndarray, numpy.ndarray] + Reference composition and density arrays. + model_curves : dict[str, tuple[numpy.ndarray, numpy.ndarray]] + Per-model composition and density arrays. + + Returns + ------- + dict[str, float] + Absolute peak-position error keyed by model name. """ x_ref, rho_ref = ref_curve ex_ref_dense = _excess_curve(x_ref, rho_ref) @@ -242,7 +321,23 @@ def metrics( rmse_excess_density: dict[str, float], peak_x_error: dict[str, float], ) -> dict[str, dict]: - """Return metric data.""" + """ + Combine individual metrics into the table payload. + + Parameters + ---------- + rmse_density : dict[str, float] + Density RMSE values. + rmse_excess_density : dict[str, float] + Excess-density RMSE values. + peak_x_error : dict[str, float] + Peak-position errors. + + Returns + ------- + dict[str, dict] + Metric-name to per-model mapping. + """ return { "RMSE density": rmse_density, "RMSE excess density": rmse_excess_density, @@ -253,7 +348,23 @@ def metrics( def test_ethanol_water_density( metrics: dict[str, dict], densities_parity: dict[str, list], debug_curve_plots ) -> None: - """Launch analysis.""" + """ + Execute density analysis fixtures and emit debug output. + + Parameters + ---------- + metrics : dict[str, dict] + Metrics table payload. + densities_parity : dict[str, list] + Parity plot payload. + debug_curve_plots : None + Side-effect fixture for debug plotting. + + Returns + ------- + None + The test validates fixture execution and writes artifacts. + """ print( MODEL_INDEX ) # TODO: these print statements may be useful for debugging, but should I remove? diff --git a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py index 5d469d6e1..b0e5351ec 100644 --- a/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py +++ b/ml_peg/analysis/liquids/ethanol_water_density/io_tools.py @@ -1,4 +1,4 @@ -"""i/o tools for analysis of ethanol-water densities.""" +"""I/O tools for analysis of ethanol-water densities.""" from __future__ import annotations @@ -23,11 +23,29 @@ def _debug_plot_enabled() -> bool: + """ + Return whether debug plots are enabled via environment variable. + + Returns + ------- + bool + ``True`` when ``DEBUG_PLOTS`` is set to a truthy value. + """ # Turn on plots by: DEBUG_PLOTS=1 pytest ... return os.environ.get("DEBUG_PLOTS", "0") not in ("0", "", "false", "False") def _savefig(fig, outpath: Path) -> None: + """ + Save and close a Matplotlib figure. + + Parameters + ---------- + fig : matplotlib.figure.Figure + Figure object to save. + outpath : pathlib.Path + Output path. + """ outpath.parent.mkdir(parents=True, exist_ok=True) fig.tight_layout() fig.savefig(outpath, dpi=200) @@ -36,11 +54,17 @@ def _savefig(fig, outpath: Path) -> None: def _read_model_curve(model_name: str) -> tuple[list[float], list[float]]: """ - Read model density curve by computing averages from raw time series. + Read a model density curve by averaging per-case time series. + + Parameters + ---------- + model_name : str + Name of model output directory under calculation outputs. - Expects per-composition files: - x_ethanol_XX/density_timeseries.csv - with columns: step, rho_g_cm3 + Returns + ------- + tuple[list[float], list[float]] + Mole-fraction values and corresponding mean densities. """ model_dir = CALC_PATH / model_name xs: list[float] = [] @@ -86,17 +110,12 @@ def _read_model_curve(model_name: str) -> tuple[list[float], list[float]]: def read_ref_curve() -> tuple[list[float], list[float]]: """ - Load densities given on a uniform weight-fraction grid. - - And convert to mole fraction. - Densities from: - M. Southard and D. Green, Perry’s Chemical Engineers’ Handbook, - 9th Edition. McGraw-Hill Education, 2018. + Load the reference density curve and convert to mole fraction. - Assumes: - - 101 evenly spaced points - - first line = 0 wt% ethanol - - last line = 100 wt% + Returns + ------- + tuple[list[float], list[float]] + Mole-fraction x-values and reference densities in g/cm^3. """ ref_file = DATA_PATH / "densities_293.15.txt" rho_ref = np.loadtxt(ref_file) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py b/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py index bac4af789..290bad22b 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_compositions.py @@ -12,7 +12,16 @@ @dataclass(frozen=True) class CompositionCase: - """Map composition to file.""" + """ + Map composition value to structure filename. + + Attributes + ---------- + x_ethanol : float + Ethanol mole fraction for the case. + filename : str + Structure filename associated with the composition. + """ x_ethanol: float filename: str @@ -20,9 +29,12 @@ class CompositionCase: def load_compositions() -> list[CompositionCase]: """ - Load composition grid. + Load composition grid from ``compositions.csv``. - Expected CSV columns: x_ethanol, filename + Returns + ------- + list[CompositionCase] + Parsed composition cases ordered as in the CSV file. """ comps_file = DATA_PATH / "compositions.csv" cases: list[CompositionCase] = [] diff --git a/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py b/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py index 0b28195e0..abc471ec5 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_fake_data.py @@ -1,4 +1,4 @@ -"""for debugging, to verify that metrics actually do something reasonable.""" +"""Generate fake data for debugging density-curve metrics.""" from __future__ import annotations @@ -11,7 +11,32 @@ @dataclass(frozen=True) class FakeCurveParams: - """Class for fake curve parameters.""" + """ + Parameters controlling synthetic curve perturbations. + + Attributes + ---------- + severity : float + Global scaling of all enabled perturbation components. + bias : float + Additive offset magnitude. + scale : float + Multiplicative distortion magnitude. + tilt : float + Linear-in-composition additive distortion magnitude. + warp : float + Smooth nonlinear distortion magnitude. + noise_sigma : float + Standard deviation for additive white noise. + corr_len : float + Correlation length for correlated noise component. + bump_amp : float + Localized bump amplitude. + bump_center : float + Composition location for localized bump. + bump_width : float + Width of localized bump in composition units. + """ # Master knob: 0 -> perfect match, 1 -> very poor severity: float = 0.0 @@ -33,7 +58,23 @@ class FakeCurveParams: def _smooth_random_field( xs: np.ndarray, corr_len: float, rng: np.random.Generator ) -> np.ndarray: - """Create a zero-mean, ~unit-std smooth random field along xs.""" + """ + Create a smooth random field along the composition grid. + + Parameters + ---------- + xs : numpy.ndarray + Composition grid. + corr_len : float + Correlation length in composition units. + rng : numpy.random.Generator + Random number generator. + + Returns + ------- + numpy.ndarray + Approximately zero-mean, unit-variance correlated field. + """ if corr_len <= 0: return np.zeros_like(xs) @@ -55,10 +96,25 @@ def make_fake_curve_from_ref( clip: tuple[float | None, float | None] = (None, None), ) -> tuple[list[float], list[float]]: """ - Return (xs, ys_fake) using the same xs as the reference. - - `severity` scales *all* enabled components. For example, if bias=10 and - severity=0.2, you get ~2 units of bias (with a tiny randomization). + Build a synthetic curve from reference data and perturbation settings. + + Parameters + ---------- + xs_ref : list[float] + Reference x-grid. + ys_ref : list[float] + Reference y-values. + params : FakeCurveParams + Perturbation parameters controlling distortion type and magnitude. + seed : int | None, optional + Seed for deterministic random perturbations. + clip : tuple[float | None, float | None], optional + Optional ``(min, max)`` clipping bounds for generated y values. + + Returns + ------- + tuple[list[float], list[float]] + Synthetic x and y arrays. """ sev = float(np.clip(params.severity, 0.0, 1.0)) rng = np.random.default_rng(seed) @@ -73,6 +129,14 @@ def make_fake_curve_from_ref( # Small randomization so multiple models with same severity aren’t identical # (but still deterministic for a given seed). def jitter(): + """ + Return a small random multiplier around unity. + + Returns + ------- + float + Random scale factor in a narrow range near 1. + """ return 0.85 + 0.30 * rng.random() # 1) multiplicative scale error @@ -124,7 +188,21 @@ def make_fake_curve( kind: str | int, seed: int | None = 0, ) -> tuple[list[float], list[float]]: - """Make a fake density curve based on a reference.""" + """ + Generate a synthetic density curve with predefined quality level. + + Parameters + ---------- + kind : str | int + Quality label or index (perfect/good/medium/bad or 0-3). + seed : int | None, optional + Seed for deterministic randomness. + + Returns + ------- + tuple[list[float], list[float]] + Synthetic x and y arrays. + """ xs_ref, ys_ref = read_ref_curve() kind = kind.lower().strip() if isinstance(kind, str) else kind @@ -182,7 +260,29 @@ def make_fake_density_timeseries( tau: float = 0.15, # relaxation rate (bigger -> faster) noise_sigma: float = 0.001, # per-step noise ) -> list[float]: - """Make a time series of fake density values.""" + """ + Generate a fake density time series with relaxation dynamics. + + Parameters + ---------- + rho_eq : float + Equilibrium density target. + n_steps : int + Number of time steps to generate. + seed : int + Seed for deterministic random noise. + start_offset : float, optional + Magnitude of initial offset from equilibrium. + tau : float, optional + Relaxation factor per step. + noise_sigma : float, optional + Standard deviation of additive Gaussian noise per step. + + Returns + ------- + list[float] + Generated density values. + """ rng = np.random.default_rng(seed) rho0 = rho_eq + start_offset * (2 * rng.random() - 1) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py index 720aff283..93bb9835c 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py @@ -1,4 +1,4 @@ -"""i/o tools for calculations.""" +"""I/O tools for calculations.""" from __future__ import annotations @@ -15,20 +15,23 @@ def write_density_timeseries_checkpointed( do_not_raise: bool = False, ) -> None: """ - Write density_timeseries.csv with checkpoint validation. - - Behavior - -------- - If file exists: - - read existing values - - verify >= min_match_fraction already match rho_series - - overwrite anyway - - raise AssertionError if insufficient match - - If file does not exist: - - just write - - Helps detect broken resume logic while still allowing overwrite. + Write ``density_timeseries.csv`` with checkpoint validation. + + Parameters + ---------- + ts_path : pathlib.Path + Output CSV path. + rho_series : collections.abc.Iterable[float] + Density samples in g/cm^3. + min_match_fraction : float, optional + Required fraction of matching values versus existing checkpoint. + do_not_raise : bool, optional + If ``True``, skip assertion failures when checkpoint mismatches. + + Returns + ------- + None + This function writes the file in-place. """ rho_series = list(rho_series) @@ -76,14 +79,25 @@ class DensityTimeseriesLogger: """ Streaming CSV logger for density time series. - - deletes existing file on start (optional) - - writes header once - - append rows as simulation runs - - flushes every write (crash-safe) - - usable as context manager + Parameters + ---------- + path : pathlib.Path + Output CSV file path. + overwrite : bool, optional + Whether to delete pre-existing output when opening. """ def __init__(self, path: Path, *, overwrite: bool = True): + """ + Initialize the logger. + + Parameters + ---------- + path : pathlib.Path + Path to CSV output file. + overwrite : bool, optional + If ``True``, delete existing file when opening. + """ self.path = Path(path) self.overwrite = overwrite self._f = None @@ -94,7 +108,14 @@ def __init__(self, path: Path, *, overwrite: bool = True): # context manager API # --------------------- def __enter__(self): - """Open the file.""" + """ + Open the output file and return logger instance. + + Returns + ------- + DensityTimeseriesLogger + Logger ready to write rows. + """ if self.overwrite and self.path.exists(): self.path.unlink() @@ -107,7 +128,23 @@ def __enter__(self): return self def __exit__(self, exc_type, exc, tb): - """Close the file.""" + """ + Close the output file. + + Parameters + ---------- + exc_type : type | None + Exception type, if raised inside context. + exc : BaseException | None + Exception instance, if raised inside context. + tb : traceback | None + Traceback, if raised inside context. + + Returns + ------- + None + This method performs cleanup only. + """ if self._f: self._f.close() @@ -115,7 +152,19 @@ def __exit__(self, exc_type, exc, tb): # logging # --------------------- def write(self, rho: float): - """Write one density value.""" + """ + Write one density value to the CSV file. + + Parameters + ---------- + rho : float + Density in g/cm^3 for the current sample. + + Returns + ------- + None + This method appends one row to disk. + """ self._writer.writerow([self._step, f"{rho:.8f}"]) self._f.flush() # critical for crash safety self._step += 1 diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index d5a0e2e16..0b9728a08 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -1,4 +1,4 @@ -"""calculate ethanol-water density curves.""" +"""Calculate ethanol-water density curves.""" from __future__ import annotations @@ -36,6 +36,19 @@ def _case_id(composition) -> str: + """ + Build a readable test identifier for a composition case. + + Parameters + ---------- + composition : Any + Composition object with an ``x_ethanol`` attribute. + + Returns + ------- + str + Case identifier shown in pytest output. + """ # nicer test ids in `pytest -vv` return f"x={composition.x_ethanol:.2f}" @@ -45,7 +58,21 @@ def _case_id(composition) -> str: "composition", COMPOSITIONS, ids=[_case_id(c) for c in COMPOSITIONS] ) def test_water_ethanol_density_curve(mlip: tuple[str, Any], composition) -> None: - """Either run the md simulation or fake the data.""" + """ + Generate one density-curve case for a model and composition. + + Parameters + ---------- + mlip : tuple[str, Any] + Pair of model name and model object. + composition : Any + Composition case input. + + Returns + ------- + None + This test writes output files for a single case. + """ if not FAKE_DATA: water_ethanol_density_curve_one_case(mlip, composition) else: @@ -53,7 +80,21 @@ def test_water_ethanol_density_curve(mlip: tuple[str, Any], composition) -> None def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: - """Run an md simulation.""" + """ + Run one MD simulation case and write its density time series. + + Parameters + ---------- + mlip : tuple[str, Any] + Pair of model name and model object. + case : Any + Composition case containing ``x_ethanol`` and ``filename``. + + Returns + ------- + None + This function writes outputs for one composition. + """ model_name, model = mlip # TODO: dispersion ??? model_out = OUT_PATH / model_name @@ -77,7 +118,21 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> None: - """Generate fake data for debugging instead of running the test.""" + """ + Generate one synthetic density time series for debugging. + + Parameters + ---------- + mlip : tuple[str, Any] + Pair of model name and model object. + case : Any + Composition case containing ``x_ethanol``. + + Returns + ------- + None + This function writes a fake density time series. + """ model_name, model = mlip model_out = OUT_PATH / model_name diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index 1e62b19bc..8c968fe7e 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -1,4 +1,4 @@ -"""code for md simulation.""" +"""Code for molecular-dynamics simulation workflows.""" from __future__ import annotations @@ -25,13 +25,37 @@ def total_mass_kg(atoms): - """Return the mass in kg for ase atoms.""" + """ + Return atomic-system mass in kilograms. + + Parameters + ---------- + atoms : ase.Atoms + Atomic configuration. + + Returns + ------- + float + Total mass in kilograms. + """ amu_to_kg = 1.66053906660e-27 return atoms.get_masses().sum() * amu_to_kg def density_g_cm3(atoms): - """Return density in g/cm^3.""" + """ + Return density in g/cm^3. + + Parameters + ---------- + atoms : ase.Atoms + Atomic configuration with periodic cell volume. + + Returns + ------- + float + Density in g/cm^3. + """ v_a3 = atoms.get_volume() v_m3 = v_a3 * 1e-30 m_kg = total_mass_kg(atoms) @@ -40,7 +64,27 @@ def density_g_cm3(atoms): def attach_basic_logging(dyn, atoms, md_logfile, log_every, t0): - """Attach a logger to an ase md simulation.""" + """ + Attach text and progress loggers to an ASE dynamics object. + + Parameters + ---------- + dyn : ase.md.md.MolecularDynamics + Dynamics object to attach callbacks to. + atoms : ase.Atoms + Current atomic system. + md_logfile : str | pathlib.Path + Path to ASE MD log file. + log_every : int + Logging interval in MD steps. + t0 : float + Start timestamp from ``time.time()``. + + Returns + ------- + None + This function mutates ``dyn`` by attaching callbacks. + """ logger = MDLogger( dyn, atoms, @@ -53,6 +97,7 @@ def attach_basic_logging(dyn, atoms, md_logfile, log_every, t0): dyn.attach(logger, interval=log_every) def progress(): + """Print one progress line with thermodynamic state.""" step = dyn.get_number_of_steps() rho = density_g_cm3(atoms) volume = atoms.get_volume() @@ -72,7 +117,27 @@ def progress(): @contextmanager def traj_logging(dyn, atoms, workdir, traj_every: int, name="md.traj"): - """Context manager for logging trajectory.""" + """ + Attach trajectory logging inside a context manager. + + Parameters + ---------- + dyn : ase.md.md.MolecularDynamics + Dynamics object receiving callback. + atoms : ase.Atoms + Atomic system written to trajectory. + workdir : pathlib.Path + Output directory. + traj_every : int + Trajectory write interval in steps. + name : str, optional + Trajectory filename within ``workdir``. + + Yields + ------ + ase.io.trajectory.Trajectory | None + Open trajectory handle when enabled, otherwise ``None``. + """ traj = None if traj_every and traj_every > 0: traj = Trajectory(str(workdir / name), "a", atoms) @@ -103,9 +168,45 @@ def run_one_case( workdir: Path, ) -> Iterable[float]: """ - Run NPT and return (mean_density, std_density). + Run the full MD workflow for one composition case. + + Parameters + ---------- + struct_path : pathlib.Path + Input structure path. + calc : Any + ASE-compatible calculator. + temperature : float, optional + Target temperature in kelvin. + p_bar : float, optional + Target pressure in bar. + dt_fs : float, optional + Time step in femtoseconds. + nvt_stabilise_steps : int, optional + Initial NVT stabilization steps. + npt_settle_steps : int, optional + Berendsen NPT settling steps. + nvt_thermalise_steps : int, optional + NVT thermalization steps after settling. + npt_equil_steps : int, optional + BAOAB NPT equilibration steps. + npt_prod_steps : int, optional + Production NPT steps. + sample_every : int, optional + Sampling interval for density collection. + log_every : int, optional + Logging interval in MD steps. + log_trajectory_every : int, optional + Trajectory write interval in MD steps. + dummy_data : bool, optional + If ``True``, skip simulation and generate synthetic data. + workdir : pathlib.Path + Output directory for logs and trajectories. - TODO: use lammps? Though I would guess GPU is the bottleneck so it wouldn't matter? + Returns + ------- + collections.abc.Iterable[float] + Density time series in g/cm^3. """ ts_path = workdir / "density_timeseries.csv" if dummy_data: From 84e66b5b7d998dddad5d2e8a3c165360d64ebacc Mon Sep 17 00:00:00 2001 From: Arn Date: Wed, 11 Feb 2026 14:12:20 +0000 Subject: [PATCH 08/12] simplify logic --- .../calc_ethanol_water_density.py | 7 +-- .../liquids/ethanol_water_density/md_code.py | 52 ++++--------------- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index d5a0e2e16..12fdda7b5 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -111,11 +111,8 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No rho = run_one_case( "data/mix_xe_0.00.extxyz", calc, - nvt_stabilise_steps=200, - npt_settle_steps=1000, - nvt_thermalise_steps=250, - npt_equil_steps=1000, - npt_prod_steps=1000, + nvt_steps=1000, + npt_steps=3000, log_every=50, workdir=Path("debug"), ) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index 1e62b19bc..6c3158306 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -11,7 +11,6 @@ from ase.io import Trajectory, read, write from ase.md import Langevin, MDLogger from ase.md.langevinbaoab import LangevinBAOAB -from ase.md.nptberendsen import NPTBerendsen from ase.md.velocitydistribution import ( MaxwellBoltzmannDistribution, Stationary, @@ -91,11 +90,8 @@ def run_one_case( temperature: float = 298.15, p_bar: float = 1.0, dt_fs: float = 0.5, - nvt_stabilise_steps: int = 4_000, - npt_settle_steps=7_500, - nvt_thermalise_steps: int = 1_000, - npt_equil_steps: int = 10_000, - npt_prod_steps: int = 25_000, + nvt_steps: int = 10_000, + npt_steps: int = 50_000, sample_every: int = 20, log_every: int = 200, log_trajectory_every: int = 400, @@ -110,7 +106,7 @@ def run_one_case( ts_path = workdir / "density_timeseries.csv" if dummy_data: rho_series = np.random.normal( - loc=0.9, scale=0.05, size=npt_prod_steps // sample_every + loc=0.9, scale=0.05, size=npt_steps // sample_every ) with DensityTimeseriesLogger(ts_path) as density_log: for rho in rho_series: @@ -132,55 +128,29 @@ def run_one_case( dt = dt_fs * fs t0 = time.time() - - # the used pre-relax is not good enough - # do some Langevin NVT steps before starting NPT - dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=0.02) - attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) - with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - dyn.run(nvt_stabilise_steps) - - # quick Berendsen settle close to target density ps = 1000 * fs - dyn = NPTBerendsen( - atoms, - timestep=dt, - temperature_K=temperature, - pressure_au=p_bar * bar, - taut=0.07 * ps, - taup=0.4 * ps, - compressibility=4.5e-5, - ) - attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) - with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - dyn.run(npt_settle_steps) + T_tau = 0.5 * ps - # thermalise - MaxwellBoltzmannDistribution(atoms, temperature_K=temperature) - Stationary(atoms) - ZeroRotation(atoms) - dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=0.03) + dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=1 / (T_tau)) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - dyn.run(nvt_thermalise_steps) - + dyn.run(nvt_steps) # real NPT - dyn = LangevinBAOAB( # MTK + dyn = LangevinBAOAB( # use MTK? atoms, timestep=dt, temperature_K=temperature, externalstress=p_bar * bar, - T_tau=0.1 * ps, - P_tau=1 * ps, + T_tau=T_tau, + P_tau=0.5 + * ps, # same timeconstants for baro/thermostat is fine for stochastic ones hydrostatic=True, rng=0, ) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - dyn.run(npt_equil_steps) - rhos = [] - n_samples = npt_prod_steps // sample_every + n_samples = npt_steps // sample_every with DensityTimeseriesLogger(ts_path) as density_log: for _ in range(n_samples): dyn.run(sample_every) From f99a1feeccaa3f9191c8c22d5e45caf968c045ca Mon Sep 17 00:00:00 2001 From: Arn Date: Wed, 11 Feb 2026 16:05:02 +0000 Subject: [PATCH 09/12] fix dispersion --- .../ethanol_water_density/calc_ethanol_water_density.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 0263b019a..e0a50d8bd 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -95,12 +95,13 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: None This function writes outputs for one composition. """ - model_name, model = mlip # TODO: dispersion ??? + model_name, model = mlip model_out = OUT_PATH / model_name model_out.mkdir(parents=True, exist_ok=True) calc = model.get_calculator() + calc = model.add_d3_calculator(calc) struct_path = DATA_PATH / case.filename if not struct_path.exists(): @@ -162,7 +163,7 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No # run a very small simulation to see if it does something reasonable from mace.calculators import mace_mp - calc = mace_mp("data_old/mace-omat-0-small.model") + calc = mace_mp("data_old/mace-omat-0-small.model", dispersion=True) rho = run_one_case( "data/mix_xe_0.00.extxyz", calc, From 5ca1e4c2ddbf39cb121f94b630f27842efc49fd2 Mon Sep 17 00:00:00 2001 From: Arn Date: Wed, 11 Feb 2026 16:50:43 +0000 Subject: [PATCH 10/12] longer run --- .../liquids/ethanol_water_density/md_code.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index 2991c7ab5..37e0656bc 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -156,7 +156,7 @@ def run_one_case( p_bar: float = 1.0, dt_fs: float = 0.5, nvt_steps: int = 10_000, - npt_steps: int = 50_000, + npt_steps: int = 200_000, sample_every: int = 20, log_every: int = 200, log_trajectory_every: int = 400, @@ -178,16 +178,10 @@ def run_one_case( Target pressure in bar. dt_fs : float, optional Time step in femtoseconds. - nvt_stabilise_steps : int, optional - Initial NVT stabilization steps. - npt_settle_steps : int, optional - Berendsen NPT settling steps. - nvt_thermalise_steps : int, optional - NVT thermalization steps after settling. - npt_equil_steps : int, optional - BAOAB NPT equilibration steps. - npt_prod_steps : int, optional - Production NPT steps. + nvt_steps : int, optional + Initial NVT stabilisation and mixing steps. + npt_steps : int, optional + NPT steps. sample_every : int, optional Sampling interval for density collection. log_every : int, optional @@ -230,9 +224,14 @@ def run_one_case( dt = dt_fs * fs t0 = time.time() ps = 1000 * fs - T_tau = 0.5 * ps + thermostat_tau = 0.5 * ps - dyn = Langevin(atoms, timestep=dt, temperature_K=temperature, friction=1 / (T_tau)) + dyn = Langevin( + atoms, + timestep=dt, + temperature_K=temperature, + friction=1 / thermostat_tau, + ) attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): dyn.run(nvt_steps) @@ -242,7 +241,7 @@ def run_one_case( timestep=dt, temperature_K=temperature, externalstress=p_bar * bar, - T_tau=T_tau, + T_tau=thermostat_tau, P_tau=0.5 * ps, # same timeconstants for baro/thermostat is fine for stochastic ones hydrostatic=True, From fd4d8129d8bf228263f4ff3a6c2881e9e14b06ef Mon Sep 17 00:00:00 2001 From: Arn Date: Tue, 17 Feb 2026 21:24:15 +0000 Subject: [PATCH 11/12] shorten dispersion for testing --- .../calc_ethanol_water_density.py | 51 ++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index e0a50d8bd..631c15ca3 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -35,6 +35,48 @@ COMPOSITIONS = load_compositions() +def add_shorter_d3_calculator(model, calcs): + """ + Add D3 dispersion to calculator(s). + + Parameters + ---------- + model + Model to add the dispersion. + calcs + Calculator, or list of calculators, to add D3 dispersion to via a + SumCalculator. + + Returns + ------- + SumCalculator | Calculator + Calculator(s) with D3 dispersion added, or the original calculator when + the model is already trained with D3 corrections. + """ + if model.trained_on_d3: + return calcs + from ase import units + from ase.calculators.mixing import SumCalculator + import torch + from torch_dftd.torch_dftd3_calculator import TorchDFTD3Calculator + + if not isinstance(calcs, list): + calcs = [calcs] + + d3_calc = TorchDFTD3Calculator( + device=model.d3_kwargs.get("device", "cpu"), + damping=model.d3_kwargs.get("damping", "bj"), + xc=model.d3_kwargs.get("xc", "pbe"), + dtype=getattr(torch, model.d3_kwargs.get("dtype", "float32")), + cutoff=model.d3_kwargs.get( + "cutoff", 25.0 * units.Bohr + ), # shortened to make run more manageable. + ) + calcs.append(d3_calc) + + return SumCalculator(calcs) + + def _case_id(composition) -> str: """ Build a readable test identifier for a composition case. @@ -101,7 +143,7 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: model_out.mkdir(parents=True, exist_ok=True) calc = model.get_calculator() - calc = model.add_d3_calculator(calc) + calc = add_shorter_d3_calculator(model, calc) # TODO: don't forget to change back struct_path = DATA_PATH / case.filename if not struct_path.exists(): @@ -161,9 +203,14 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No if __name__ == "__main__": # TODO: delete this # run a very small simulation to see if it does something reasonable + from ase import units from mace.calculators import mace_mp - calc = mace_mp("data_old/mace-omat-0-small.model", dispersion=True) + calc = mace_mp( + "data_old/mace-omat-0-small.model", + dispersion=True, + dispersion_cutoff=25 * units.Bohr, + ) rho = run_one_case( "data/mix_xe_0.00.extxyz", calc, From a2bb0ade7ee39d9d4b2461e25539a3618403a531 Mon Sep 17 00:00:00 2001 From: Arn Date: Wed, 18 Feb 2026 14:48:16 +0000 Subject: [PATCH 12/12] checkpoint loading --- .../ethanol_water_density/_io_tools.py | 38 ++++-- .../calc_ethanol_water_density.py | 9 +- .../liquids/ethanol_water_density/md_code.py | 129 ++++++++++++++---- 3 files changed, 134 insertions(+), 42 deletions(-) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py index 93bb9835c..a974ead78 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/_io_tools.py @@ -104,9 +104,6 @@ def __init__(self, path: Path, *, overwrite: bool = True): self._writer = None self._step = 0 - # --------------------- - # context manager API - # --------------------- def __enter__(self): """ Open the output file and return logger instance. @@ -117,13 +114,35 @@ def __enter__(self): Logger ready to write rows. """ if self.overwrite and self.path.exists(): + mode = "w" self.path.unlink() - - self._f = self.path.open("w", newline="") + elif not self.path.exists(): + mode = "w" + else: + mode = "a" + # If appending, recover last step index + try: + with self.path.open("r", newline="") as f: + last_step = -1 + for row in csv.reader(f): + if not row or row[0] == "step": + continue + try: + last_step = int(row[0]) + except ValueError: + continue + self._step = last_step + 1 + except Exception: + # If file is corrupted or empty, just continue from 0 + self._step = 0 + + self._f = self.path.open(mode, newline="") self._writer = csv.writer(self._f) - self._writer.writerow(["step", "rho_g_cm3"]) - self._f.flush() + # Only write header if creating new file + if mode == "w": + self._writer.writerow(["step", "rho_g_cm3"]) + self._f.flush() return self @@ -148,9 +167,6 @@ def __exit__(self, exc_type, exc, tb): if self._f: self._f.close() - # --------------------- - # logging - # --------------------- def write(self, rho: float): """ Write one density value to the CSV file. @@ -166,5 +182,5 @@ def write(self, rho: float): This method appends one row to disk. """ self._writer.writerow([self._step, f"{rho:.8f}"]) - self._f.flush() # critical for crash safety + self._f.flush() self._step += 1 diff --git a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py index 631c15ca3..8598b0db1 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/calc_ethanol_water_density.py @@ -30,6 +30,7 @@ MODELS = load_models(current_models) MODEL_INDEX = {name: i for i, name in enumerate(MODELS)} FAKE_DATA = os.getenv("FAKE_DENSITY_DATA", "") == "1" +CONTINUE_RUNNING = os.getenv("CONTINUE_RUNNING", "") == "1" # IMPORTANT: create the list once for parametrization COMPOSITIONS = load_compositions() @@ -143,6 +144,7 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: model_out.mkdir(parents=True, exist_ok=True) calc = model.get_calculator() + # calc = model.add_d3_calculator(calc) calc = add_shorter_d3_calculator(model, calc) # TODO: don't forget to change back struct_path = DATA_PATH / case.filename @@ -154,7 +156,9 @@ def water_ethanol_density_curve_one_case(mlip: tuple[str, Any], case) -> None: case_dir = model_out / f"x_ethanol_{case.x_ethanol:.2f}" case_dir.mkdir(parents=True, exist_ok=True) - rho_series = run_one_case(struct_path, calc, workdir=case_dir) + rho_series = run_one_case( + struct_path, calc, workdir=case_dir, continue_running=CONTINUE_RUNNING + ) ts_path = case_dir / "density_timeseries.csv" write_density_timeseries_checkpointed(ts_path, rho_series) @@ -215,8 +219,9 @@ def water_ethanol_density_dummy_data_one_case(mlip: tuple[str, Any], case) -> No "data/mix_xe_0.00.extxyz", calc, nvt_steps=1000, - npt_steps=3000, + npt_steps=4000, log_every=50, workdir=Path("debug"), + continue_running=True, ) print(rho) diff --git a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py index 37e0656bc..ed74cc928 100644 --- a/ml_peg/calcs/liquids/ethanol_water_density/md_code.py +++ b/ml_peg/calcs/liquids/ethanol_water_density/md_code.py @@ -161,6 +161,7 @@ def run_one_case( log_every: int = 200, log_trajectory_every: int = 400, dummy_data=False, + continue_running=False, workdir: Path, ) -> Iterable[float]: """ @@ -190,6 +191,8 @@ def run_one_case( Trajectory write interval in MD steps. dummy_data : bool, optional If ``True``, skip simulation and generate synthetic data. + continue_running : bool, optional + If ``True``, continue running from the previous trajectory. workdir : pathlib.Path Output directory for logs and trajectories. @@ -199,6 +202,8 @@ def run_one_case( Density time series in g/cm^3. """ ts_path = workdir / "density_timeseries.csv" + traj_path = workdir / "md.traj" + if dummy_data: rho_series = np.random.normal( loc=0.9, scale=0.05, size=npt_steps // sample_every @@ -207,52 +212,107 @@ def run_one_case( for rho in rho_series: density_log.write(rho) return rho_series - atoms = read(struct_path) - atoms.set_pbc(True) - atoms.wrap() - atoms.calc = calc - - # fast pre-relax - opt = FIRE(atoms, logfile=str(workdir / "opt.log")) - opt.run(fmax=0.15) - # velocities - MaxwellBoltzmannDistribution(atoms, temperature_K=temperature) - Stationary(atoms) - ZeroRotation(atoms) + workdir.mkdir(parents=True, exist_ok=True) dt = dt_fs * fs t0 = time.time() ps = 1000 * fs thermostat_tau = 0.5 * ps - dyn = Langevin( - atoms, - timestep=dt, - temperature_K=temperature, - friction=1 / thermostat_tau, - ) - attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) - with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - dyn.run(nvt_steps) - # real NPT - dyn = LangevinBAOAB( # use MTK? + target_samples = npt_steps // sample_every + + if continue_running: + # 1) Load last saved state (positions + cell + momenta if stored) + if traj_path.exists(): + with Trajectory(str(traj_path), "r") as tr: + if len(tr) == 0: + raise RuntimeError(f"{traj_path} exists but contains no frames.") + atoms = tr[-1] + else: + raise RuntimeError(f"No traj path at {traj_path}. Cannot continue running") + + # 2) Count how many samples are already present + if ts_path.exists(): + # assume one rho per non-empty line; ignore a header if present + n_lines = 0 + with open(ts_path, encoding="utf-8") as f: + for line in f: + s = line.strip() + if not s: + continue + # skip header-ish lines + if any(c.isalpha() for c in s): + continue + n_lines += 1 + already_samples = n_lines + else: + raise RuntimeError(f"no ts_path at {ts_path}. Cannot continue running") + + # If we've already finished, just return what we have + if already_samples >= target_samples: + # load and return existing rhos + rhos_existing = [] + if ts_path.exists(): + with open(ts_path, encoding="utf-8") as f: + for line in f: + s = line.strip() + if not s or any(c.isalpha() for c in s): + continue + # tolerate csv with extra columns + rhos_existing.append(float(s.split(",")[0])) + return np.array(rhos_existing) + atoms.calc = calc + else: + already_samples = 0 + atoms = read(struct_path) + atoms.set_pbc(True) + atoms.wrap() + atoms.calc = calc + + # fast pre-relax + opt = FIRE(atoms, logfile=str(workdir / "opt.log")) + opt.run(fmax=0.15) + + # velocities + MaxwellBoltzmannDistribution(atoms, temperature_K=temperature) + Stationary(atoms) + ZeroRotation(atoms) + + # NVT + dyn = Langevin( + atoms, + timestep=dt, + temperature_K=temperature, + friction=1 / thermostat_tau, + ) + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): + dyn.run(nvt_steps) + + # NPT + dyn = LangevinBAOAB( atoms, timestep=dt, temperature_K=temperature, externalstress=p_bar * bar, T_tau=thermostat_tau, - P_tau=0.5 - * ps, # same timeconstants for baro/thermostat is fine for stochastic ones + P_tau=0.5 * ps, hydrostatic=True, rng=0, ) + dyn.nsteps = already_samples * sample_every # seems public enough to me + attach_basic_logging(dyn, atoms, str(workdir / "md.log"), log_every, t0) + + remaining_samples = target_samples - already_samples + rhos = [] + with traj_logging(dyn, atoms, workdir, traj_every=log_trajectory_every): - rhos = [] - n_samples = npt_steps // sample_every - with DensityTimeseriesLogger(ts_path) as density_log: - for _ in range(n_samples): + with DensityTimeseriesLogger( + ts_path, overwrite=not continue_running + ) as density_log: + for _ in range(remaining_samples): dyn.run(sample_every) rho = density_g_cm3(atoms) rhos.append(rho) @@ -261,4 +321,15 @@ def run_one_case( # save final structure for debugging/repro write(workdir / "final.extxyz", atoms) + # If resuming, return the whole series (existing + new) for convenience + if continue_running and ts_path.exists(): + rhos_all = [] + with open(ts_path, encoding="utf-8") as f: + for line in f: + s = line.strip() + if not s or any(c.isalpha() for c in s): + continue + rhos_all.append(float(s.split(",")[0])) + return np.array(rhos_all) + return np.array(rhos)