diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..1bccc1fa --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.h5 filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index d6dcaec1..e27caecb 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -18,7 +18,7 @@ jobs: # windows-latest is not supported because pyscf is not supported on windows # https://pyscf.org/user/install.html os: ["ubuntu-latest", "macos-latest"] - py: ["3.9", "3.10", "3.11", "3.12"] + py: ["3.10", "3.11", "3.12"] steps: - uses: "actions/checkout@v4" @@ -28,6 +28,23 @@ jobs: with: python-version: ${{ matrix.py }} + - name: Install system dependencies for PyTables (Linux/macOS) + run: | + python -m pip install --upgrade pip wheel + pip install numpy==1.26.4 + if [[ "$RUNNER_OS" == "Linux" ]]; then + sudo apt-get update + sudo apt-get install -y libhdf5-dev libblosc-dev + elif [[ "$RUNNER_OS" == "macOS" ]]; then + export HOMEBREW_NO_INSTALL_CLEANUP=1 + brew update + brew install hdf5 c-blosc + echo "CPATH=$(brew --prefix hdf5)/include:$(brew --prefix c-blosc)/include:$CPATH" >> "$GITHUB_ENV" + echo "LIBRARY_PATH=$(brew --prefix hdf5)/lib:$(brew --prefix c-blosc)/lib:$LIBRARY_PATH" >> "$GITHUB_ENV" + echo "HDF5_DIR=$(brew --prefix hdf5)" >> "$GITHUB_ENV" + fi + + - name: Install development version run: | pip install -v . 
@@ -37,6 +54,7 @@ jobs: pip install --upgrade pip pip install .[test_extra] + - name: Run pytest default tests uses: pavelzw/pytest-action@v2 with: @@ -59,3 +77,5 @@ jobs: click-to-expand: true report-title: 'Dev Test Report' pytest-args: '-m dev' + + diff --git a/atomdb/data/database_beta_1.3.0.h5 b/atomdb/data/database_beta_1.3.0.h5 index 744f7497..b4904224 100644 Binary files a/atomdb/data/database_beta_1.3.0.h5 and b/atomdb/data/database_beta_1.3.0.h5 differ diff --git a/atomdb/data/elements_data.h5 b/atomdb/data/elements_data.h5 new file mode 100644 index 00000000..f8cb3e66 --- /dev/null +++ b/atomdb/data/elements_data.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde2a5f5db8c0adb8418016ea8b85d5f12af4e2e40a7f07bef7bcfe474ae3e81 +size 105117616 diff --git a/atomdb/migration/periodic/elements_data.py b/atomdb/migration/periodic/elements_data.py new file mode 100644 index 00000000..22256219 --- /dev/null +++ b/atomdb/migration/periodic/elements_data.py @@ -0,0 +1,361 @@ +import csv +import tables as pt +import numpy as np +from importlib_resources import files +import warnings +from atomdb.utils import CONVERTOR_TYPES + +# Suppresses NaturalNameWarning warnings from PyTables. +warnings.filterwarnings('ignore', category=pt.NaturalNameWarning) + +# Set-up variables +elements_data_csv = files("atomdb.data").joinpath("elements_data.csv") +data_info_csv = files("atomdb.data").joinpath("data_info.csv") +hdf5_file = files("atomdb.data").joinpath("elements_data.h5") + + +# Properties of each element in the HDF5 file. 
+PROPERTY_CONFIGS = [ + { + 'basic_property': 'atnum', + 'table_name': 'atnum', + 'description': 'Atom Number', + 'type': 'int', + }, + + { + 'basic_property': 'symbol', + 'table_name': 'symbol', + 'description': 'Atom Symbol', + 'type': 'string', + }, + + { + 'basic_property': 'name', + 'table_name': 'name', + 'description': 'Atom Name', + 'type': 'string', + + }, + + { + 'basic_property': 'group', + 'table_name': 'group', + 'description': 'Atom Group', + 'type': 'int', + }, + + { + 'basic_property': 'period', + 'table_name': 'period', + 'description': 'Atom Period', + 'type': 'int', + }, + + { + 'basic_property': 'mult', + 'table_name': 'mult', + 'description': 'Atom multiplicity', + 'type': 'int', + }, + + { + 'property': 'cov_radius', + 'table_name': 'cov_radius', + 'description': 'Covalent Radius' + }, + { + 'property': 'vdw_radius', + 'table_name': 'vdw_radius', + 'description': 'Van der Waals Radius' + }, + { + 'property': 'at_radius', + 'group': 'Radius', + 'table_name': 'at_radius', + 'description': 'Atomic Radius' + }, + { + 'property': 'mass', + 'table_name': 'atmass', + 'description': 'Atomic Mass' + }, + { + 'property': 'pold', + 'table_name': 'polarizability', + 'description': 'Polarizability' + }, + { + 'property': 'c6', + 'table_name': 'dispersion_c6', + 'description': 'C6 Dispersion Coefficient' + }, + { + 'property': 'eneg', + 'table_name': 'eneg', + 'description': 'Electronegativity' + } +] + + +class NumberElementDescription(pt.IsDescription): + value = pt.Int32Col() + +class StringElementDescription(pt.IsDescription): + value = pt.StringCol(25) + + +class PropertyValues(pt.IsDescription): + """Schema for property value tables.""" + source = pt.StringCol(30, pos=0) + unit = pt.StringCol(20, pos=1) + value = pt.Float64Col(pos=2) + + +class ElementsDataInfo(pt.IsDescription): + """Schema for the property_info table.""" + property_key = pt.StringCol(20, pos=0) + property_name = pt.StringCol(50, pos=1) + source_key = pt.StringCol(30, pos=2) + 
property_description = pt.StringCol(250, pos=3) + reference = pt.StringCol(250, pos=4) + doi = pt.StringCol(150, pos=5) + notes = pt.StringCol(500, pos=6) + + +def create_properties_tables(hdf5_file, parent_folder, table_name, table_description, row_description, columns, row_data, sources_data, units_data): + """ + Create a table in the HDF5 file for a specific property. + + Args: + hdf5_file: PyTables file object. + parent_folder: Group where the table will be created. + table_name (str): Name of the table. + table_description (str): Description of the table. + row_description: PyTables IsDescription class for the table schema. + columns (list): List of column names from the CSV to include. + row_data (dict): Data for the current element. + sources_data (dict): sources of each property. + units_data (dict): units of each property. + """ + + # Creates a new table in the HDF5 file. + table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description) + + # Iterates over the list of columns relevant to the current table. + for col in columns: + source = sources_data.get(col, 'unknown') # defaulting to 'unknown' if not found. + unit = units_data.get(col, 'unknown') # defaulting to 'unknown' if not found. + value = np.nan + + if col in row_data and row_data[col].strip(): + try: + value = float(row_data[col]) + value = CONVERTOR_TYPES[unit](value) + except (ValueError, TypeError): + value = np.nan + + # Creates a new row in the table. + row = table.row + row['source'] = source.encode('utf-8') if source else '' + row['unit'] = unit.encode('utf-8') if unit else '' + row['value'] = value + row.append() + + # Flushes the table to ensure all data is written to the HDF5 file. + table.flush() + +def create_basic_properties_tables(hdf5_file, parent_folder, table_name, row_description, table_description, value, prop_type): + """ + Create a table for a single basic property. + + Args: + hdf5_file: PyTables file object. 
+ parent_folder: Group where the table will be created. + table_name (str): Name of the table. + row_description: PyTables IsDescription class for the table schema. + table_description (str): Description of the table. + value (integer or string): The value to store in the table. + """ + table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description) + row = table.row + if prop_type == 'int': + row['value'] = value + if prop_type == 'string': + row['value'] = value.encode('utf-8') if value else '' + + row.append() + table.flush() + +def read_elements_data_csv(elements_data_csv): + """ + Read the elements_data.csv file. + + Args: + elements_data_csv: Path to the elements_data.csv file. + + Returns: + - data (List): List of dictionaries containing element data. + - unique_headers (List): List of unique column headers. + - sources_data (dict): sources of each property. + - units_data (dict): units of each property. + """ + + # Opens the csv file, filters out comment lines (starting with #) and empty lines. 
+ with open(elements_data_csv, 'r') as f: + reader = csv.reader(f) + lines = [line for line in reader if any(line) and not line[0].startswith('#')] + + headers = [header.strip() for header in lines[0]] # first row as column headers + sources = [source.strip() for source in lines[1]] # second row as sources + units = [unit.strip() for unit in lines[2]] # third row as units + data_rows = lines[3:] # remaining rows as data + + # Process headers to make them unique + unique_headers = [] + header_counts = {} + for header in headers: + if header in header_counts: + header_counts[header] += 1 + unique_headers.append(f"{header}.{header_counts[header]}") # creates suffix (header.1, header.2) for duplicate headers + else: + header_counts[header] = 0 + unique_headers.append(header) + + # Create data as list of dictionaries + data = [] + for row in data_rows: + data.append(dict(zip(unique_headers, row))) + + sources_data = dict(zip(unique_headers, sources)) + units_data = dict(zip(unique_headers, units)) + + return data, unique_headers, sources_data, units_data + + +def read_data_info_csv(data_info_csv): + """ + Read and parse the data_info.csv file containing metadata. + + Args: + data_info_csv: Path to the data_info.csv file. + + Returns: + data_info (List): List of dictionaries containing metadata for each property. + """ + # Opens the csv file, filters out comment lines (starting with #) and empty lines. 
+ with open(data_info_csv, 'r') as f: + lines = [] + for line in f: + stripped = line.strip() + if stripped and not stripped.startswith('#'): + lines.append(stripped) + + # hardcode the headers + data_info_headers = [ + 'Property key', + 'Property name', + 'Source key', + 'Property description', + 'Reference', + 'doi', + 'Notes' + ] + + reader = csv.reader(lines) + data_rows = list(reader) + + data_info = [] + for row in data_rows: + data_info.append(dict(zip(data_info_headers, row))) + + return data_info + + +def write_elements_data_to_hdf5(data, unique_headers, sources_data, units_data): + """ Write element data to an HDF5 file using PyTables. + + Args: + data (list of dict): List of dictionaries containing element data. + unique_headers (list of str): List of unique column headers from the data, used to identify properties. + sources_data (dict): sources of each property. + units_data (dict): units of each property. + """ + h5file = pt.open_file(hdf5_file, mode="w", title='Periodic Data') + elements_group = h5file.create_group('/', 'Elements', 'Elements Data') + + for row in data: + atnum = int(row['atnum']) if 'atnum' in row and row['atnum'].strip() else 0 + name = row['name'] if 'name' in row and row['name'].strip() else '' + element_group_name = f"{atnum:03d}" + element_group = h5file.create_group(elements_group, element_group_name, f'Data for {name}') + + # Handle basic properties + for config in PROPERTY_CONFIGS: + if 'basic_property' in config: + property_name = config['basic_property'] + table_name = config['table_name'] + description = config['description'] + prop_type = config['type'] + + # checking the property type to use the relevant ElementDescription class + if prop_type == 'int': + row_description = NumberElementDescription + value = int(row[property_name]) if property_name in row and row[property_name].strip() else 0 + elif prop_type == 'string': + row_description = StringElementDescription + value = row[property_name] if property_name in row and 
row[property_name].strip() else '' + + create_basic_properties_tables(h5file, element_group, table_name, row_description, description, value, prop_type) + + + # handle rest of the properties + else: + columns = [col for col in unique_headers if col.startswith(config['property'])] + if columns: + create_properties_tables(h5file, element_group, config['table_name'], config['description'], PropertyValues, columns, row, + sources_data, units_data) + + h5file.close() + + +def write_data_info_to_hdf5(data_info_list): + """ + Write data from data_info.csv to the HDF5 file. + + Args: + data_info_list: List of dictionaries containing metadata. + """ + + + # Opens the HDF5 file in append mode ("a") --> add metadata without overwriting existing data. + with pt.open_file(hdf5_file, mode='a', title='Periodic Data') as h5file: + data_info_group = h5file.create_group('/', 'data_info', 'Data Info') + + property_info_table = h5file.create_table(data_info_group, 'property_info', ElementsDataInfo,'Property Information') + + for row in data_info_list: + table_row = property_info_table.row + table_row['property_key'] = row.get('Property key', '').encode('utf-8') + table_row['property_name'] = row.get('Property name', '').encode('utf-8') + table_row['source_key'] = row.get('Source key', '').encode('utf-8') + table_row['property_description'] = row.get('Property description', '').encode('utf-8') + table_row['reference'] = row.get('Reference', '').encode('utf-8') + table_row['doi'] = row.get('doi', '').encode('utf-8') + table_row['notes'] = row.get('Notes', '').encode('utf-8') + table_row.append() + property_info_table.flush() + + +if __name__ == "__main__": + # Read the elements data from the CSV file + data, unique_headers, sources_data, units_data = read_elements_data_csv(elements_data_csv) + + # Read the provenance data from the CSV file + data_info_df = read_data_info_csv(data_info_csv) + + # Write the periodic table data to an HDF5 file + write_elements_data_to_hdf5(data, 
unique_headers, sources_data, units_data) + + # Write the provenance data to the HDF5 file + write_data_info_to_hdf5(data_info_df) diff --git a/atomdb/species.py b/atomdb/species.py index 2408eb05..4758be51 100644 --- a/atomdb/species.py +++ b/atomdb/species.py @@ -32,6 +32,24 @@ from atomdb.periodic import Element, element_symbol from atomdb.utils import DEFAULT_DATAPATH, DEFAULT_DATASET, DEFAULT_REMOTE +from importlib_resources import \ +files +import tables as pt +from numbers import Integral + +elements_hdf5_file = files("atomdb.data").joinpath("elements_data.h5") + +PROPERTY_NAME_MAP = { + "atmass": "atmass", + "cov_radius": "cov_radius", + "vdw_radius": "vdw_radius", + "at_radius": "at_radius", + "polarizability": "polarizability", + "dispersion_c6": "dispersion_c6", + "elem": "symbol", + "atnum": "atnum", + "name": "name", +} __all__ = [ "Species", @@ -62,38 +80,93 @@ def default_matrix(): return np.zeros(0).reshape(1, 0) +# def scalar(method): +# r"""Expose a SpeciesData field.""" +# name = method.__name__ +# +# @property +# def wrapper(self): +# print("hi") +# +# # Map the name of the method in the SpeciesData class to the name in the Elements class +# # This dict can be removed if the Elements csv file uses the same names as the SpeciesData class. 
+# namemap = { +# "cov_radius": "cov_radius", +# "vdw_radius": "vdw_radius", +# "at_radius": "at_radius", +# "polarizability": "pold", +# "dispersion_c6": "c6", +# "atmass": "mass", +# } +# +# if name == "atmass": +# print(f"inside atmass {getattr(Element(self._data.elem), namemap[name])}") +# return getattr(Element(self._data.elem), namemap[name]) +# if name in namemap: +# # Only return Element property if neutral, otherwise None +# charge = self._data.atnum - self._data.nelec +# print(f"charge {charge}") +# print(f"inside the other {getattr(Element(self._data.elem), namemap[name])}") +# return getattr(Element(self._data.elem), namemap[name]) if charge == 0 else None +# +# return getattr(self._data, name) +# +# # conserve the docstring of the method +# wrapper.__doc__ = method.__doc__ +# return wrapper + + def scalar(method): r"""Expose a SpeciesData field.""" name = method.__name__ @property def wrapper(self): + # Checking if the property is not in PROPERTY_NAME_MAP, if not then fetch it from SpeciesData + if name not in PROPERTY_NAME_MAP: + return getattr(self._data, name) + + # calculate charge; ions get None only for neutral-atom properties (the mass and the identity fields do not depend on charge) + charge = self._data.atnum - self._data.nelec + if charge != 0 and name not in ("atmass", "elem", "atnum", "name"): + return None + + # open the HDF5 file in read mode + with pt.open_file(elements_hdf5_file, mode="r") as h5file: + # get the element group + element_group = f"/Elements/{self._data.atnum:03d}" + + table_name = PROPERTY_NAME_MAP[name] + table_path = f"{element_group}/{table_name}" + + # get the table node from the HDF5 file + table = h5file.get_node(table_path) + + # Handle basic properties (tables with only a `value` column, no `source` column) + if "source" not in table.colnames: + value = table[0]["value"] + # if the value is an int, return it as an int + if isinstance(value, Integral): + return int(value) + # if the value is a string, decode from bytes + elif isinstance(value, bytes): + return value.decode("utf-8") + else: + # handle properties with multiple sources + result = {} + for row in table: + source = 
row["source"].decode("utf-8") + value = row["value"] + # exclude NaN (missing) values + if not np.isnan(value): + result[source] = float(value) + return result if result else None - # Map the name of the method in the SpeciesData class to the name in the Elements class - # This dict can be removed if the Elements csv file uses the same names as the SpeciesData class. - namemap = { - "cov_radius": "cov_radius", - "vdw_radius": "vdw_radius", - "at_radius": "at_radius", - "polarizability": "pold", - "dispersion_c6": "c6", - "atmass": "mass", - } - - if name == "atmass": - return getattr(Element(self._data.elem), namemap[name]) - if name in namemap: - # Only return Element property if neutral, otherwise None - charge = self._data.atnum - self._data.nelec - return getattr(Element(self._data.elem), namemap[name]) if charge == 0 else None - - return getattr(self._data, name) - - # conserve the docstring of the method wrapper.__doc__ = method.__doc__ return wrapper + def _remove_suffix(input_string, suffix): if suffix and input_string.endswith(suffix): return input_string[: -len(suffix)] @@ -699,7 +772,7 @@ def dd_dens_lapl_func(self, spin="t", index=None, log=False): Return the function for the electronic density Laplacian. .. math:: - + \nabla^2 \rho(\mathbf{r}) = \frac{d^2 \rho(r)}{dr^2} + \frac{2}{r} \frac{d \rho(r)}{dr} Parameters @@ -714,13 +787,13 @@ def dd_dens_lapl_func(self, spin="t", index=None, log=False): By default, all orbitals of the given spin(s) are included. log : bool, default=False Whether the logarithm of the density is used for interpolation. - + Returns ------- Callable[np.ndarray(N,) -> np.ndarray(N,)] a callable function evaluating the Laplacian of the density given a set of radial points (1-D array). - + Notes ----- When this function is evaluated at a point close to zero, the Laplacian becomes undefined. 
@@ -738,7 +811,7 @@ def densityspline_like_func(rs): laplacian = dd_dens_spline(rs) + 2 * d_dens_sp_spline(rs) / rs laplacian = np.where(rs < 1e-10, 0.0, laplacian) return laplacian - + return densityspline_like_func @spline diff --git a/pyproject.toml b/pyproject.toml index 8b9f82a9..ee83dff2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ authors = [ description = "AtomDB is a database of atomic and ionic properties." readme = "README.md" license = {text = "GPL-3.0-or-later"} -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', @@ -36,19 +36,19 @@ classifiers = [ 'Intended Audience :: Science/Research', "Intended Audience :: Education", "Natural Language :: English", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] dependencies = [ - "numpy>=1.16", + "numpy>=1.26.4", "scipy>=1.4", "msgpack>=1.0.0", "msgpack-numpy>=0.4.8", "h5py>=3.6.0", "importlib_resources>=3.0.0", "pooch>=1.8.1", + "tables>=3.9.2", ] dynamic = ["version"] [tool.setuptools_scm] @@ -92,6 +92,9 @@ doc = [ [tool.setuptools] packages = ["atomdb"] +# Adding the package data +package-data = { "atomdb" = ["data/*.h5", "data/*.msg"] } +include-package-data = true [tool.black] line-length = 100