diff --git a/qcrboxtools/cif/file_converter/cartesian.py b/qcrboxtools/cif/file_converter/cartesian.py new file mode 100644 index 0000000..8790b6e --- /dev/null +++ b/qcrboxtools/cif/file_converter/cartesian.py @@ -0,0 +1,45 @@ +import numpy as np + + +def cell_constants_to_matrix(a: float, b: float, c: float, alpha: float, beta: float, gamma: float) -> np.ndarray: + """ + Convert cell constants to a 3x3 cell matrix. + + Parameters + ---------- + a : float + Cell length a in Angstrom. + b : float + Cell length b in Angstrom. + c : float + Cell length c in Angstrom. + alpha : float + Cell angle alpha in degrees. + beta : float + Cell angle beta in degrees. + gamma : float + Cell angle gamma in degrees. + + Returns + ------- + np.ndarray + The 3x3 cell matrix. + """ + alpha_rad = np.radians(alpha) + beta_rad = np.radians(beta) + gamma_rad = np.radians(gamma) + + cos_alpha = np.cos(alpha_rad) + cos_beta = np.cos(beta_rad) + cos_gamma = np.cos(gamma_rad) + sin_gamma = np.sin(gamma_rad) + + matrix = np.zeros((3, 3)) + matrix[0, 0] = a + matrix[0, 1] = b * cos_gamma + matrix[0, 2] = c * cos_beta + matrix[1, 1] = b * sin_gamma + matrix[1, 2] = c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma + matrix[2, 2] = c * np.sqrt(1 - cos_beta**2 - ((cos_alpha - cos_beta * cos_gamma) / sin_gamma) ** 2) + + return matrix diff --git a/qcrboxtools/cif/file_converter/tsc.py b/qcrboxtools/cif/file_converter/tsc.py new file mode 100644 index 0000000..3e9d221 --- /dev/null +++ b/qcrboxtools/cif/file_converter/tsc.py @@ -0,0 +1,535 @@ +import struct +from abc import ABC, abstractmethod +from collections.abc import Iterable +from pathlib import Path +from textwrap import wrap +from typing import Dict, List, Tuple, Union + +import numpy as np +from iotbx.cif.model import block, loop + +from ..read import read_cif_as_unified +from .cartesian import cell_constants_to_matrix + + +def read_tsc_file(path: Path): + """ + Reads a TSC or TSCB file and returns the corresponding object. + Parameters + ---------- + path : Path + The path to the TSC or TSCB file. + + Returns + ------- + TSCFile or TSCBFile + The TSCFile or TSCBFile object representing the file content. + + Raises + ------ + ValueError + If the file cannot be read as either TSC or TSCB format. + """ + path = Path(path) + if path.suffix == ".tscb": + try: + return TSCBFile.from_file(path) + except Exception as exc: + try: + return TSCFile.from_file(path) + except Exception: + raise ValueError(f"Cannot read TSCB file: {str(path)}") from exc + elif path.suffix == ".tsc": + try: + return TSCFile.from_file(path) + except Exception as exc: + try: + return TSCBFile.from_file(path) + except Exception: + raise ValueError(f"Cannot read TSC file: {str(path)}") from exc + + +def parse_header(header_str): + """ + Parses the header section of a TSC file. + + Parameters + ---------- + header_str : str + The header section of the TSC file as a string. + + Returns + ------- + dict + A dictionary containing the parsed header information. + """ + if not header_str.strip(): + return {} + header = {} + header_split = iter(val.split(":") for val in header_str.strip().split("\n")) + + header_key = None + header_entry = "" + for line_split in header_split: + if len(line_split) == 2 and header_key is not None: + header[header_key] = header_entry + if len(line_split) == 2: + header_key, header_entry = line_split + elif len(line_split) == 1 and header_key is not None: + header_entry += "\n" + line_split[0] + else: + raise ValueError(f"Malformed header line: {':'.join(line_split)}") + header[header_key] = header_entry + return header + + +def parse_tsc_data_line(line: str) -> Tuple[Tuple[int, int, int], np.ndarray]: + """ + Parses a line of TSC data. + + Parameters + ---------- + line : str + The line of TSC data to parse. + + Returns + ------- + tuple + A tuple containing the indices h, k, l and the array of f0j values. + """ + + h_str, k_str, l_str, *f0j_strs = line.split() + parts = (val.split(",") for val in f0j_strs) + f0js = np.array([float(real_val) + 1j * float(imag_val) for real_val, imag_val in parts]) + return (int(h_str), int(k_str), int(l_str)), f0js + + +class TSCBase(ABC): + def __init__(self): + self.header = {"TITLE": "generic_tsc", "SYMM": "expanded", "SCATTERERS": ""} + self.data = {} + + @property + def scatterers(self) -> List[str]: + """ + Retrieves scatterers from the TSC file as a list of strings generated + from the SCATTERERS header entry. + + Returns + ------- + list + A list of scatterer names. + """ + + return self.header["SCATTERERS"].strip().split() + + @scatterers.setter + def scatterers(self, scatterers: Iterable): + """ + Sets the scatterers in the TSC file. + + The input scatterers are converted to a space-separated string and + stored in the header under the key 'SCATTERERS'. + + Parameters + ---------- + scatterers : iterable + An iterable of scatterer names. + """ + self.header["SCATTERERS"] = " ".join(str(val) for val in scatterers) + + def __getitem__(self, atom_site_label: Union[str, Iterable]) -> Dict[Tuple[int, int, int], np.ndarray]: + """ + Retrieves f0j values for a given atom site label. + + The function allows indexing the TSCFile object by atom site label or a + list of labels. If the given label is not found among the scatterers, + a ValueError is raised. + + Parameters + ---------- + atom_site_label : str or iterable + The atom site label or a list of labels to retrieve f0j values for. + + Returns + ------- + dict + A dictionary where each key is a tuple of indices (h, k, l) and the + corresponding value is a numpy array of f0j values for the given + label(s). + + Raises + ------ + ValueError + If an unknown atom site label is used for indexing. + """ + try: + if isinstance(atom_site_label, Iterable) and not isinstance(atom_site_label, str): + indexes = np.array([self.scatterers.index(label) for label in atom_site_label]) + return {hkl: f0js[indexes] for hkl, f0js in self.data.items()} + else: + index = self.scatterers.index(atom_site_label) + return {hkl: f0js[index] for hkl, f0js in self.data.items()} + except ValueError as exc: + if isinstance(atom_site_label, Iterable) and not isinstance(atom_site_label, str): + unknown = [label for label in atom_site_label if label not in self.scatterers] + else: + unknown = [atom_site_label] + raise ValueError(f"Unknown atom label(s) used for lookup from TSCFile: {' '.join(unknown)}") from exc + + @classmethod + @abstractmethod + def from_file(cls, filename: Path): + pass + + @abstractmethod + def to_file(self, filename: Path): + pass + + def _construct_moiety_loop(self, structure_cif_block: block): + """ + Constructs a CIF loop containing moiety information from a given CIF block. + + Parameters + ---------- + structure_cif_block : block + The CIF block containing the structure information. + + Returns + ------- + loop + A CIF loop containing the moiety information. + """ + cell_a = float(structure_cif_block["_cell.length_a"]) + cell_b = float(structure_cif_block["_cell.length_b"]) + cell_c = float(structure_cif_block["_cell.length_c"]) + alpha = float(structure_cif_block["_cell.angle_alpha"]) + beta = float(structure_cif_block["_cell.angle_beta"]) + gamma = float(structure_cif_block["_cell.angle_gamma"]) + cell_mat_m = cell_constants_to_matrix(cell_a, cell_b, cell_c, alpha, beta, gamma) + fract_x = np.array([float(val) for val in structure_cif_block["_atom_site.fract_x"]]) + fract_y = np.array([float(val) for val in structure_cif_block["_atom_site.fract_y"]]) + fract_z = np.array([float(val) for val in structure_cif_block["_atom_site.fract_z"]]) + + xyz_fract = np.stack((fract_x, fract_y, fract_z), axis=-1) + xyz_cart = np.einsum("xy, zy -> zx", cell_mat_m, xyz_fract) + cart_x, cart_y, cart_z = xyz_cart.T + + n_atoms = len(cart_x) + + # TODO revisit this as more sophisticated moiety handling is implemented + moiety_loop_data = { + "_wfn_moiety.id": np.full(n_atoms, 1), + "_wfn_moiety.atom_id": np.arange(1, n_atoms + 1), + "_wfn_moiety.asu_atom_site_label": structure_cif_block["_atom_site.label"], + "_wfn_moiety.atom_type_symbol": structure_cif_block["_atom_site.type_symbol"], + "_wfn_moiety.symm_code": ["1_555"] * n_atoms, + "_wfn_moiety.cartn_x": list(cart_x), + "_wfn_moiety.cartn_y": list(cart_y), + "_wfn_moiety.cartn_z": list(cart_z), + "_wfn_moiety.aff_index": [ + self.scatterers.index(name) + 1 for name in structure_cif_block["_atom_site.label"] + ], + } + + return loop(data=moiety_loop_data) + + def _construct_aff_loop(self): + def create_aff_line_string(values): + converted = [f"{val: 3.8f}" for val in values] + single_line = " ".join(converted) + return "[" + "\n".join(wrap(single_line, width=2047)) + "]" + + mil_hkl = np.asarray(list(self.data.keys())) + all_affs = np.array(list(self.data.values())) + aff_loop_data = { + "_aspheric_ff.index_h": mil_hkl[:, 0].copy(), + "_aspheric_ff.index_k": mil_hkl[:, 1].copy(), + "_aspheric_ff.index_l": mil_hkl[:, 2].copy(), + "_aspheric_ff.form_factor_real": list(create_aff_line_string(line) for line in np.real(all_affs)), + "_aspheric_ff.form_factor_imag": list(create_aff_line_string(line) for line in np.imag(all_affs)), + } + return loop(data=aff_loop_data) + + def to_cif( + self, structure_cif_block: block, partitioning_source: str, partitioning_name: str, partitioning_software: str + ) -> block: + """ + Converts the TSC data to a CIF block format. + + Parameters + ---------- + structure_cif_block : block + The CIF block containing the structure information. + partitioning_source : str + The source of the partitioning. + partitioning_name : str + The name of the partitioning scheme employed. + partitioning_software : str + The software used for the partitioning. + + Returns + ------- + block + A CIF block containing the TSC data. + """ + tsc_block = block() + tsc_block.add_data_item("_cell.length_a", structure_cif_block["_cell.length_a"]) + tsc_block.add_data_item("_cell.length_b", structure_cif_block["_cell.length_b"]) + tsc_block.add_data_item("_cell.length_c", structure_cif_block["_cell.length_c"]) + tsc_block.add_data_item("_cell.angle_alpha", structure_cif_block["_cell.angle_alpha"]) + tsc_block.add_data_item("_cell.angle_beta", structure_cif_block["_cell.angle_beta"]) + tsc_block.add_data_item("_cell.angle_gamma", structure_cif_block["_cell.angle_gamma"]) + tsc_block.add_loop(self._construct_moiety_loop(structure_cif_block)) + tsc_block.add_data_item("_aspheric_ffs.source", partitioning_source) + tsc_block.add_data_item("_aspheric_ffs_partitioning.name", partitioning_name) + tsc_block.add_data_item("_aspheric_ffs_partitioning.software", partitioning_software) + tsc_block.add_loop(self._construct_aff_loop()) + + return tsc_block + + def populate_from_cif_block(self, cif_block: block): + """ + Populates the TSCFile object from a CIF block created by the TSC to cif export function. + Parameters + ---------- + cif_block : block + The CIF block containing the TSC data. + Raises + ------ + ValueError + If the CIF block does not contain the required entries. + """ + if ( + "_aspheric_ffs.source" not in cif_block + or "_aspheric_ffs_partitioning.name" not in cif_block + or "_aspheric_ffs_partitioning.software" not in cif_block + ): + raise ValueError("CIF block does not contain required TSC entries.") + self.scatterers = cif_block["_wfn_moiety.asu_atom_site_label"] + aff_loop = cif_block.get_loop("_aspheric_ff") + if aff_loop is None: + raise ValueError("CIF block does not contain required TSC entries for the loop _aspheric_ff.") + hkl_zip = zip( + aff_loop["_aspheric_ff.index_h"], aff_loop["_aspheric_ff.index_k"], aff_loop["_aspheric_ff.index_l"] + ) + hkl_tuples = tuple((int(mil_h), int(mil_k), int(mil_l)) for mil_h, mil_k, mil_l in hkl_zip) + real_lines = aff_loop["_aspheric_ff.form_factor_real"] + imag_lines = aff_loop["_aspheric_ff.form_factor_imag"] + real_vals = np.fromiter( + (float(val) for line in real_lines for val in line.strip("[]").split()), dtype=np.float64 + ) + imag_vals = np.fromiter( + (float(val) for line in imag_lines for val in line.strip("[]").split()), dtype=np.float64 + ) + all_affs = real_vals + 1j * imag_vals + n_atoms = len(self.scatterers) + if len(all_affs) % n_atoms != 0: + raise ValueError("Number of AFF values is not a multiple of number of scatterers.") + all_affs = all_affs.reshape((-1, n_atoms)) + if len(hkl_tuples) != len(all_affs): + raise ValueError("Number of Miller indices does not match number of AFF value sets.") + self.data = {hkl: affs for hkl, affs in zip(hkl_tuples, all_affs)} + + +class TSCFile(TSCBase): + """ + A class representing a TSC file as defined in doi:10.48550/arXiv.1911.08847 + + A TSC file contains atomic form factors for a list of atoms and miller + indicees + + You can get data for atoms for example with tsc['C1'] or tsc[['C1', 'C2']] + currently setting is not implemented this way. All data is represented + in the data attribute + + Attributes + ---------- + header : dict + A dictionary holding the header information from the TSC file. + data : dict + A dictionary mapping tuples (h, k, l) to numpy arrays of f0j values, + where the ordering of the values is given by the content of the + scatterers property / the SCATTERERS entry in the header. + """ + + @classmethod + def from_file(cls, filename: Path) -> "TSCFile": + """ + Constructs a TSCFile object from a file. + + The function reads the TSC file, parses its header and data sections, + and constructs a TSCFile instance with these data. + + Parameters + ---------- + filename : Path + The name of the TSC file to read. + + Returns + ------- + TSCFile + A TSCFile instance with data loaded from the file. + """ + with open(filename, "r") as fobj: + tsc_content = fobj.read() + header_str, data_str = tsc_content.split("DATA:\n") + + new_obj = cls() + + new_obj.header.update(parse_header(header_str)) + + parsed_data_lines = iter(parse_tsc_data_line(line) for line in data_str.strip().split("\n")) + + new_obj.data = {hkl: f0js for hkl, f0js in parsed_data_lines} + + return new_obj + + def to_file(self, filename: Path) -> None: + """ + Writes the TSCFile object to a file. + + The function formats the header and data sections of the TSCFile object + and writes them to a file. Currently no safety checks are implemented + SCATTERERS and data need to match + + Parameters + ---------- + filename : Path + The name of the file to write. + """ + header_str = "\n".join(f"{key}: {value}" for key, value in self.header.items()) + formatted_data_lines = ( + f"{int(hkl[0])} {int(hkl[1])} {int(hkl[2])} " + + f"{' '.join(f'{np.real(val):.8e},{np.imag(val):.8e}' for val in values)}" + for hkl, values in self.data.items() + ) + data_str = "\n".join(formatted_data_lines) + + with open(filename, "w") as fobj: + fobj.write(f"{header_str}\nDATA:\n{data_str}\n") + + @classmethod + def from_cif_file(cls, cif_path: Path) -> "TSCFile": + """ + Constructs a TSCFile object from a CIF file created by the TSC to cif export function. + + Parameters + ---------- + filename : Path + The name of the CIF file to read. + + Returns + ------- + TSCFile + A TSCFile instance with data loaded from the CIF file. + """ + cif_block = read_cif_as_unified(cif_path, 0) + new_obj = cls() + new_obj.populate_from_cif_block(cif_block) + return new_obj + + +class TSCBFile(TSCBase): + """ + A class representing a TSCB file used by for example NoSpherA2 + + A TSC file contains atomic form factors for a list of atoms and miller + indicees + + You can get data for atoms for example with tsc['C1'] or tsc[['C1', 'C2']] + currently setting is not implemented this way. All data is represented + in the data attribute + + Attributes + ---------- + header : dict + A dictionary holding the header information from the TSC file. + data : dict + A dictionary mapping tuples (h, k, l) to numpy arrays of f0j values, + where the ordering of the values is given by the content of the + scatterers property / the SCATTERERS entry in the header. + """ + + @classmethod + def from_file(cls, filename: Path) -> "TSCBFile": + """ + Constructs a TSCFile object from a file. + + The function reads the TSC file, parses its header and data sections, + and constructs a TSCFile instance with these data. + + Parameters + ---------- + filename : Path + The name of the TSC file to read. + + Returns + ------- + TSCFile + A TSCBFile instance with data loaded from the file. + """ + new_obj = cls() + with open(filename, "rb") as fobj: + additional_header_size, n_bytes_labels = struct.unpack("2i", fobj.read(8)) + if additional_header_size > 0: + header_str = fobj.read(additional_header_size).decode("ASCII") + + new_obj.header.update(parse_header(header_str)) + new_obj.header["SCATTERERS"] = fobj.read(n_bytes_labels).decode("ASCII") + + n_refln = struct.unpack("i", fobj.read(4))[0] + n_atoms = len(new_obj.header["SCATTERERS"].split()) + new_obj.data = { + tuple(np.frombuffer(fobj.read(12), dtype=np.int32)): np.frombuffer( + fobj.read(n_atoms * 16), dtype=np.complex128 + ) + for i in range(n_refln) + } + return new_obj + + def to_file(self, filename: Path) -> None: + """ + Writes the TSCBFile object to a file. + + The function formats the header and data sections of the TSCBFile object + and writes them to a file. Currently no safety checks are implemented + SCATTERERS and data need to match + + Parameters + ---------- + filename : str + The name of the file to write. + """ + if not next(iter(self.data.values())).dtype == np.complex128: + self.data = {key: value.astype(np.complex128) for key, value in self.data.items()} + omitted_header_entries = ("SCATTERERS", "TITLE", "SYMM") + header_string = "\n".join( + f"{name}: {entry}" for name, entry in self.header.items() if name not in omitted_header_entries + ) + with open(filename, "wb") as fobj: + fobj.write(struct.pack("2i", len(header_string), len(self.header["SCATTERERS"]))) + fobj.write(header_string.encode("ASCII")) + fobj.write(self.header["SCATTERERS"].encode("ASCII")) + fobj.write(struct.pack("i", len(self.data))) + fobj.write(bytes().join(struct.pack("3i", *hkl) + f0js.tobytes() for hkl, f0js in self.data.items())) + + @classmethod + def from_cif_file(cls, cif_path: Path) -> "TSCBFile": + """ + Constructs a TSCFile object from a CIF file created by the TSC to cif export function. + + Parameters + ---------- + filename : Path + The name of the CIF file to read. + + Returns + ------- + TSCBFile + A TSCBFile instance with data loaded from the CIF file. + """ + cif_block = read_cif_as_unified(cif_path, 0) + new_obj = cls() + new_obj.populate_from_cif_block(cif_block) + return new_obj diff --git a/qcrboxtools/cif/read.py b/qcrboxtools/cif/read.py index 47ebf3f..a668205 100644 --- a/qcrboxtools/cif/read.py +++ b/qcrboxtools/cif/read.py @@ -64,7 +64,7 @@ def cifdata_str_or_index(model: cif.model.cif, dataset: Union[int, str]) -> Tupl def read_cif_as_unified( cif_path: Union[str, Path], - dataset: Optional[str] = None, + dataset: Optional[Union[str, int]] = None, convert_keywords: bool = True, custom_categories: Optional[List[str]] = None, split_sus: bool = True, diff --git a/tests/cif/convert/test_tsc.py b/tests/cif/convert/test_tsc.py new file mode 100644 index 0000000..72ee345 --- /dev/null +++ b/tests/cif/convert/test_tsc.py @@ -0,0 +1,564 @@ +""" +Test module for `qcrboxtools.cif.file_converter.tsc`. + +This module contains unit tests for TSC and TSCB file reading, parsing, +and conversion functionality. +""" + +import struct + +import numpy as np +import pytest + +from qcrboxtools.cif.file_converter.tsc import ( + TSCBFile, + TSCFile, + parse_header, + parse_tsc_data_line, + read_tsc_file, +) + + +@pytest.fixture +def sample_tsc_content(): + """Sample TSC file content for testing.""" + return """TITLE: Test TSC File +SYMM: expanded +SCATTERERS: C1 C2 O1 +DATA: +1 0 0 1.23450000e+00,0.00000000e+00 2.34560000e+00,1.00000000e+00 3.45670000e+00,-1.00000000e+00 +0 1 0 4.56780000e+00,0.50000000e+00 5.67890000e+00,-0.50000000e+00 6.78900000e+00,0.25000000e+00 +""" + + +@pytest.fixture +def sample_header_string(): + """Sample header string for testing.""" + return """TITLE: Test Header +SYMM: expanded +SCATTERERS: C1 C2 O1 +MULTI_LINE: This is a +multi-line entry +that spans several lines""" + + +@pytest.fixture +def sample_tscb_file(tmp_path): + """Create a sample TSCB file for testing.""" + tscb_path = tmp_path / "test.tscb" + + # Create minimal TSCB file content + header_str = "TITLE: Test TSCB\nSYMM: expanded" + scatterers_str = "C1 C2 O1" + + with open(tscb_path, "wb") as f: + # Write header size and scatterers size + f.write(struct.pack("2i", len(header_str), len(scatterers_str))) + # Write header and scatterers + f.write(header_str.encode("ASCII")) + f.write(scatterers_str.encode("ASCII")) + # Write number of reflections + f.write(struct.pack("i", 2)) + + # Write reflection data: hkl + form factors for 3 atoms + # Reflection 1: (1,0,0) + f.write(struct.pack("3i", 1, 0, 0)) + f.write(np.array([1.0 + 0.0j, 2.0 + 1.0j, 3.0 - 1.0j], dtype=np.complex128).tobytes()) + + # Reflection 2: (0,1,0) + f.write(struct.pack("3i", 0, 1, 0)) + f.write(np.array([4.0 + 0.5j, 5.0 - 0.5j, 6.0 + 0.25j], dtype=np.complex128).tobytes()) + + return tscb_path + + +def test_parse_header(sample_header_string): + """Test header parsing functionality.""" + header = parse_header(sample_header_string) + + assert header["TITLE"] == " Test Header" + assert header["SYMM"] == " expanded" + assert header["SCATTERERS"] == " C1 C2 O1" + assert "This is a\nmulti-line entry\nthat spans several lines" in header["MULTI_LINE"] + + +def test_parse_header_empty(): + """Test header parsing with empty string.""" + header = parse_header("") + assert header == {} + + +def test_parse_tsc_data_line(): + """Test parsing of TSC data lines.""" + line = "1 0 0 1.23450000e+00,0.00000000e+00 2.34560000e+00,1.00000000e+00" + hkl, f0js = parse_tsc_data_line(line) + + assert hkl == (1, 0, 0) + assert len(f0js) == 2 + assert np.isclose(f0js[0], 1.2345 + 0.0j) + assert np.isclose(f0js[1], 2.3456 + 1.0j) + + +def test_parse_tsc_data_line_negative_indices(): + """Test parsing TSC data line with negative Miller indices.""" + line = "-1 2 -3 1.00000000e+00,2.00000000e+00" + hkl, f0js = parse_tsc_data_line(line) + + assert hkl == (-1, 2, -3) + assert len(f0js) == 1 + assert np.isclose(f0js[0], 1.0 + 2.0j) + + +def test_read_tsc_file_tsc_extension(tmp_path, sample_tsc_content): + """Test reading a .tsc file.""" + tsc_path = tmp_path / "test.tsc" + tsc_path.write_text(sample_tsc_content) + + result = read_tsc_file(tsc_path) + + assert isinstance(result, TSCFile) + assert result.scatterers == ["C1", "C2", "O1"] + assert (1, 0, 0) in result.data + assert (0, 1, 0) in result.data + + +def test_read_tsc_file_tscb_extension(sample_tscb_file): + """Test reading a .tscb file.""" + result = read_tsc_file(sample_tscb_file) + + assert isinstance(result, TSCBFile) + assert result.scatterers == ["C1", "C2", "O1"] + assert (1, 0, 0) in result.data + assert (0, 1, 0) in result.data + + +def test_read_tsc_file_invalid_tsc(tmp_path): + """Test reading invalid TSC file raises ValueError.""" + invalid_tsc = tmp_path / "invalid.tsc" + invalid_tsc.write_text("This is not a valid TSC file") + + with pytest.raises(ValueError, match="Cannot read TSC file"): + read_tsc_file(invalid_tsc) + + +def test_read_tsc_file_invalid_tscb(tmp_path): + """Test reading invalid TSCB file raises ValueError.""" + invalid_tscb = tmp_path / "invalid.tscb" + invalid_tscb.write_bytes(b"This is not a valid TSCB file") + + with pytest.raises(ValueError, match="Cannot read TSCB file"): + read_tsc_file(invalid_tscb) + + +def test_tsc_file_scatterers_property(): + """Test TSCFile scatterers property.""" + tsc = TSCFile() + tsc.header["SCATTERERS"] = "C1 C2 O1 N1" + + assert tsc.scatterers == ["C1", "C2", "O1", "N1"] + + +def test_tsc_file_scatterers_setter(): + """Test TSCFile scatterers setter.""" + tsc = TSCFile() + tsc.scatterers = ["C1", "C2", "O1"] + + assert tsc.header["SCATTERERS"] == "C1 C2 O1" + + +def test_tsc_file_getitem_single_atom(): + """Test TSCFile indexing with single atom label.""" + tsc = TSCFile() + tsc.scatterers = ["C1", "C2", "O1"] + tsc.data = { + (1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j, 3.0 - 1.0j]), + (0, 1, 0): np.array([4.0 + 0.5j, 5.0 - 0.5j, 6.0 + 0.25j]), + } + + result = tsc["C1"] + expected = {(1, 0, 0): 1.0 + 0.0j, (0, 1, 0): 4.0 + 0.5j} + + assert len(result) == 2 + for hkl, value in expected.items(): + assert np.isclose(result[hkl], value) + + +def test_tsc_file_getitem_multiple_atoms(): + """Test TSCFile indexing with multiple atom labels.""" + tsc = TSCFile() + tsc.scatterers = ["C1", "C2", "O1"] + tsc.data = { + (1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j, 3.0 - 1.0j]), + } + + result = tsc[["C1", "O1"]] + expected_values = np.array([1.0 + 0.0j, 3.0 - 1.0j]) + + assert len(result) == 1 + assert np.allclose(result[(1, 0, 0)], expected_values) + + +def test_tsc_file_getitem_unknown_atom(): + """Test TSCFile indexing with unknown atom raises ValueError.""" + tsc = TSCFile() + tsc.scatterers = ["C1", "C2"] + + with pytest.raises(ValueError, match="Unknown atom label.*O1"): + tsc["O1"] + + +def test_tsc_file_from_file(tmp_path, sample_tsc_content): + """Test TSCFile.from_file method.""" + tsc_path = tmp_path / "test.tsc" + tsc_path.write_text(sample_tsc_content) + + tsc = TSCFile.from_file(tsc_path) + + assert tsc.header["TITLE"] == " Test TSC File" + assert tsc.scatterers == ["C1", "C2", "O1"] + assert len(tsc.data) == 2 + + +def test_tsc_file_to_file(tmp_path): + """Test TSCFile.to_file method.""" + tsc = TSCFile() + tsc.header = {"TITLE": "Test", "SYMM": "expanded", "SCATTERERS": "C1 C2"} + tsc.data = {(1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j])} + + output_path = tmp_path / "output.tsc" + tsc.to_file(output_path) + + # Verify file was written correctly + content = output_path.read_text() + assert "TITLE: Test" in content + assert "DATA:" in content + assert "1 0 0" in content + + +def test_tscb_file_from_file(sample_tscb_file): + """Test TSCBFile.from_file method.""" + tscb = TSCBFile.from_file(sample_tscb_file) + + assert tscb.scatterers == ["C1", "C2", "O1"] + assert len(tscb.data) == 2 + assert (1, 0, 0) in tscb.data + assert len(tscb.data[(1, 0, 0)]) == 3 + + +def test_tscb_file_to_file(tmp_path): + """Test TSCBFile.to_file method.""" + tscb = TSCBFile() + tscb.header = {"TITLE": "Test", "SYMM": "expanded", "SCATTERERS": "C1 C2"} + tscb.data = {(1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j], dtype=np.complex128)} + + output_path = tmp_path / "output.tscb" + tscb.to_file(output_path) + + # Verify file exists and has content + assert output_path.exists() + assert output_path.stat().st_size > 0 + + # Try to read it back + tscb_read = TSCBFile.from_file(output_path) + assert tscb_read.scatterers == ["C1", "C2"] + assert (1, 0, 0) in tscb_read.data + + +def test_tscb_file_empty_header(tmp_path): + """Test TSCBFile with empty additional header.""" + tscb_path = tmp_path / "test_empty_header.tscb" + scatterers_str = "C1" + + with open(tscb_path, "wb") as f: + # Write zero header size + f.write(struct.pack("2i", 0, len(scatterers_str))) + f.write(scatterers_str.encode("ASCII")) + f.write(struct.pack("i", 1)) # One reflection + f.write(struct.pack("3i", 1, 0, 0)) + f.write(np.array([1.0 + 0.0j], dtype=np.complex128).tobytes()) + + tscb = TSCBFile.from_file(tscb_path) + assert tscb.scatterers == ["C1"] + assert len(tscb.data) == 1 + + +@pytest.fixture +def sample_structure_cif_content(): + """Sample structure CIF content for testing.""" + return """ +data_test +_cell.length_a 10.000 +_cell.length_b 12.000 +_cell.length_c 8.000 +_cell.angle_alpha 90.0 +_cell.angle_beta 95.0 +_cell.angle_gamma 90.0 + +loop_ +_atom_site.label +_atom_site.type_symbol +_atom_site.fract_x +_atom_site.fract_y +_atom_site.fract_z +C1 C 0.1 0.2 0.3 +C2 C 0.4 0.5 0.6 +O1 O 0.7 0.8 0.9 +""" + + +@pytest.fixture +def sample_tsc_cif_content(): + """Sample TSC-generated CIF content for testing.""" + return """ +data_test +_cell.length_a 10.000 +_cell.length_b 12.000 +_cell.length_c 8.000 +_cell.angle_alpha 90.0 +_cell.angle_beta 95.0 +_cell.angle_gamma 90.0 + +loop_ +_wfn_moiety.id +_wfn_moiety.atom_id +_wfn_moiety.asu_atom_site_label +_wfn_moiety.atom_type_symbol +_wfn_moiety.symm_code +_wfn_moiety.cartn_x +_wfn_moiety.cartn_y +_wfn_moiety.cartn_z +_wfn_moiety.aff_index +1 1 C1 C 1_555 1.0 2.4 2.4 1 +1 2 C2 C 1_555 4.0 6.0 4.8 2 +1 3 O1 O 1_555 7.0 9.6 7.2 3 + +_aspheric_ffs.source 'test_source' +_aspheric_ffs_partitioning.name 'test_partitioning' +_aspheric_ffs_partitioning.software 'test_software' + +loop_ +_aspheric_ff.index_h +_aspheric_ff.index_k +_aspheric_ff.index_l +_aspheric_ff.form_factor_real +_aspheric_ff.form_factor_imag +1 0 0 '[1.00000000 2.00000000 3.00000000]' '[0.00000000 1.00000000 -1.00000000]' +0 1 0 '[4.00000000 5.00000000 6.00000000]' '[0.50000000 -0.50000000 0.25000000]' +""" + + +@pytest.fixture +def structure_cif_block(tmp_path, sample_structure_cif_content): + """Create a structure CIF file and return the first block.""" + cif_path = tmp_path / "structure.cif" + cif_path.write_text(sample_structure_cif_content) + + from qcrboxtools.cif.read import read_cif_as_unified + + return read_cif_as_unified(cif_path, 0) + + +@pytest.fixture +def tsc_cif_block(tmp_path, sample_tsc_cif_content): + """Create a TSC CIF file and return the first block.""" + cif_path = tmp_path / "tsc.cif" + cif_path.write_text(sample_tsc_cif_content) + + from qcrboxtools.cif.read import read_cif_as_unified + + return read_cif_as_unified(cif_path, 0) + + +def test_tsc_to_cif_conversion(structure_cif_block): + """Test converting TSC data to CIF format.""" + # Create a TSC file with test data + tsc = TSCFile() + tsc.scatterers = ["C1", "C2", "O1"] + tsc.data = { + (1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j, 3.0 - 1.0j]), + (0, 1, 0): np.array([4.0 + 0.5j, 5.0 - 0.5j, 6.0 + 0.25j]), + } + + # Convert to CIF + cif_block = tsc.to_cif( + structure_cif_block, + partitioning_source="test_source", + partitioning_name="test_partitioning", + partitioning_software="test_software", + ) + + # Verify cell parameters are preserved + assert cif_block["_cell.length_a"] == "10.000" + assert cif_block["_cell.length_b"] == "12.000" + assert cif_block["_cell.angle_beta"] == "95.0" + + # Verify partitioning metadata + assert cif_block["_aspheric_ffs.source"] == "test_source" + assert cif_block["_aspheric_ffs_partitioning.name"] == "test_partitioning" + assert cif_block["_aspheric_ffs_partitioning.software"] == "test_software" + + # Verify moiety loop exists + moiety_loop = cif_block.get_loop("_wfn_moiety") + assert moiety_loop is not None + assert len(moiety_loop["_wfn_moiety.atom_id"]) == 3 + + # Verify AFF loop exists + aff_loop = cif_block.get_loop("_aspheric_ff") + assert aff_loop is not None + assert len(aff_loop["_aspheric_ff.index_h"]) == 2 + + +def test_tsc_populate_from_cif_block(tsc_cif_block): + """Test populating TSC from CIF block.""" + tsc = TSCFile() + tsc.populate_from_cif_block(tsc_cif_block) + + # Verify scatterers + assert tsc.scatterers == ["C1", "C2", "O1"] + + # Verify data was loaded correctly + assert len(tsc.data) == 2 + assert (1, 0, 0) in tsc.data + assert (0, 1, 0) in tsc.data + + # Verify form factor values + hkl_100_data = tsc.data[(1, 0, 0)] + assert np.isclose(hkl_100_data[0], 1.0 + 0.0j) + assert np.isclose(hkl_100_data[1], 2.0 + 1.0j) + assert np.isclose(hkl_100_data[2], 3.0 - 1.0j) + + +def test_tsc_from_cif_file(tmp_path, sample_tsc_cif_content): + """Test creating TSC from CIF file.""" + cif_path = tmp_path / "test.cif" + cif_path.write_text(sample_tsc_cif_content) + + tsc = TSCFile.from_cif_file(cif_path) + + assert tsc.scatterers == ["C1", "C2", "O1"] + assert len(tsc.data) == 2 + assert (1, 0, 0) in tsc.data + + +def test_tscb_from_cif_file(tmp_path, sample_tsc_cif_content): + """Test creating TSCB from CIF file.""" + cif_path = tmp_path / "test.cif" + cif_path.write_text(sample_tsc_cif_content) + + tscb = TSCBFile.from_cif_file(cif_path) + + assert tscb.scatterers == ["C1", "C2", "O1"] + assert len(tscb.data) == 2 + assert (1, 0, 0) in tscb.data + + +def test_populate_from_cif_block_missing_entries(): + """Test error handling when CIF block is missing required entries.""" + from iotbx.cif.model import block + + # Create incomplete block missing required entries + incomplete_block = block() + incomplete_block.add_data_item("_cell.length_a", "10.0") + + tsc = TSCFile() + + with pytest.raises(ValueError, match="CIF block does not contain required TSC entries"): + tsc.populate_from_cif_block(incomplete_block) + + +def test_populate_from_cif_block_missing_aff_loop(): + """Test error handling when AFF loop is missing.""" + from iotbx.cif.model import block + + # Create block with metadata but no AFF loop + incomplete_block = block() + incomplete_block.add_data_item("_aspheric_ffs.source", "test") + incomplete_block.add_data_item("_aspheric_ffs_partitioning.name", "test") + incomplete_block.add_data_item("_aspheric_ffs_partitioning.software", "test") + + tsc = TSCFile() + + with pytest.raises(KeyError): + tsc.populate_from_cif_block(incomplete_block) + + +def test_populate_from_cif_mismatched_atom_count(): + """Test error when AFF values don't match atom count.""" + from iotbx.cif.model import block, loop + + # Create block with mismatched data + test_block = block() + test_block.add_data_item("_aspheric_ffs.source", "test") + test_block.add_data_item("_aspheric_ffs_partitioning.name", "test") + test_block.add_data_item("_aspheric_ffs_partitioning.software", "test") + + # Moiety loop with 2 atoms + moiety_data = {"_wfn_moiety.asu_atom_site_label": ["C1", "C2"]} + test_block.add_loop(loop(data=moiety_data)) + + # AFF loop with 3 values (mismatch) + aff_data = { + "_aspheric_ff.index_h": [1], + "_aspheric_ff.index_k": [0], + "_aspheric_ff.index_l": [0], + "_aspheric_ff.form_factor_real": ["[1.0 2.0 3.0]"], # 3 values + "_aspheric_ff.form_factor_imag": ["[0.0 1.0 -1.0]"], # 3 values + } + test_block.add_loop(loop(data=aff_data)) + + tsc = TSCFile() + + with pytest.raises(ValueError, match="Number of AFF values is not a multiple of number of scatterers"): + tsc.populate_from_cif_block(test_block) + + +def test_construct_aff_loop_formatting(): + """Test that AFF loop formats form factors correctly.""" + tsc = TSCFile() + tsc.data = { + (1, 0, 0): np.array([1.23456789 + 0.0j, 2.345 + 1.0j]), + (-1, 2, -3): np.array([4.567 - 0.5j, 6.789 + 0.25j]), + } + + aff_loop = tsc._construct_aff_loop() + + # Verify structure + assert len(aff_loop["_aspheric_ff.index_h"]) == 2 + assert aff_loop["_aspheric_ff.index_h"][0] == "1" + assert aff_loop["_aspheric_ff.index_k"][1] == "2" + assert aff_loop["_aspheric_ff.index_l"][1] == "-3" + + # Verify formatting (should be wrapped in brackets with proper precision) + real_line = aff_loop["_aspheric_ff.form_factor_real"][0] + assert real_line.startswith("[") + assert real_line.endswith("]") + assert "1.23456789" in real_line + + +def test_round_trip_tsc_cif_conversion(tmp_path, structure_cif_block): + """Test round-trip TSC -> CIF -> TSC conversion preserves data.""" + # Create original TSC + original_tsc = TSCFile() + original_tsc.scatterers = ["C1", "C2", "O1"] + original_tsc.data = { + (1, 0, 0): np.array([1.0 + 0.0j, 2.0 + 1.0j, 3.0 - 1.0j]), + (0, 1, 0): np.array([4.0 + 0.5j, 5.0 - 0.5j, 6.0 + 0.25j]), + (-1, -2, 3): np.array([7.0 + 2.0j, 8.0 - 2.0j, 9.0 + 0.1j]), + } + + # Convert to CIF + cif_block = original_tsc.to_cif( + structure_cif_block, partitioning_source="test", partitioning_name="test", partitioning_software="test" + ) + + # Convert back to TSC + reconstructed_tsc = TSCFile() + reconstructed_tsc.populate_from_cif_block(cif_block) + + # Verify data is preserved + assert reconstructed_tsc.scatterers == original_tsc.scatterers + assert len(reconstructed_tsc.data) == len(original_tsc.data) + + for hkl in original_tsc.data: + assert hkl in reconstructed_tsc.data + np.testing.assert_allclose(reconstructed_tsc.data[hkl], original_tsc.data[hkl], rtol=1e-6)