Source code for mofbuilder.io.cif_reader

import re
import numpy as np
from pathlib import Path
from .basic import convert_fraction_to_decimal, remove_bracket, remove_quotes, remove_tail_number, extract_quote_lines
from .basic import find_keyword, extract_xyz_lines
from veloxchem.outputstream import OutputStream
from veloxchem.veloxchemlib import mpi_master
from veloxchem.errorhandler import assert_msg_critical
import mpi4py.MPI as MPI
import sys
"""
atom_type, atom_label, atom_number, residue_name, residue_number, x, y, z, spin, charge, note
"""
# This reader will read a cif file and create a Topology object



[docs]
class CifReader:
    """
    Class for reading CIF files, extracting symmetry and atomic site information,
    and providing atom coordinates in the primitive cell.

    Attributes:
        comm (MPI.Comm): MPI communicator.
        rank (int): Rank of the process.
        nodes (int): Total number of nodes/processes.
        ostream (OutputStream): Output stream for logging.
        filepath (str or Path): Path to the CIF file.
        data (np.ndarray or None): Array with atom data.
        _debug (bool): Toggle debug logging.
        cell_info (list): Unpacked cell dimensions and angles.
        symmetry_sector (list): Lines containing symmetry operations.
        atom_site_sector (list): Lines defining atomic sites.
        net_name (str): CIF network name.
        spacegroup (str): Space group label from CIF.
        hall_number (str): Hall number label from CIF.
        V_con (str): Connected component value from CIF, if present.
        EC_con (str): Edge connectivity value from CIF, if present.
        ciffile_lines (list[str]): Filtered lines from the CIF file.
        fcoords (np.ndarray): Fractional coordinates for atoms.
        target_fcoords (np.ndarray): Specific fractional coordinates for requested atom type.
    """

    def __init__(self, comm: MPI.Comm = None, ostream: OutputStream = None, filepath: str = None):
        """
        Initialize a CifReader instance.

        Args:
            comm (MPI.Comm, optional): MPI communicator. Defaults to MPI.COMM_WORLD.
            ostream (OutputStream, optional): Output stream for logging. If None, selected by rank.
            filepath (str, optional): Path to the CIF file to be read.
        """
        if comm is None:
            comm = MPI.COMM_WORLD

        if ostream is None:
            if comm.Get_rank() == mpi_master():
                ostream = OutputStream(sys.stdout)
            else:
                ostream = OutputStream(None)

        self.comm: MPI.Comm = comm
        self.rank: int = self.comm.Get_rank()
        self.nodes: int = self.comm.Get_size()
        self.ostream: OutputStream = ostream

        self.filepath: str = filepath
        self.data: np.ndarray | None = None

        self._debug: bool = False

    def _extract_value_str_slice(self, s: list) -> int:
        """
        Extracts a signed integer coefficient from a symmetry operation string segment.

        Args:
            s (list): List of string fragments containing symmetry operation.

        Returns:
            int: The signed integer value (e.g., +1, -1).
        """
        if len(s) == 0:
            return 0
        sign = 1
        mul_value = 1
        s_list = list(s[0])

        if "-" in s_list:
            sign = -1
        if "*" in s_list:
            mul_value = s_list[s_list.index("*") - 1]

        return sign * int(mul_value)

    def _extract_value_from_str(self, s: str) -> tuple[int, int, int, float]:
        """
        Extracts x, y, z coefficients and constants from a symmetry operation string.

        Args:
            s (str): The symmetry operation component string.

        Returns:
            tuple[int, int, int, float]: (x_coeff, y_coeff, z_coeff, constant)
        """
        s = re.sub(r" ", "", s)
        s = re.sub(r"(?<=[+-])", ",", s[::-1])[::-1]
        if s[0] == ",":
            s = s[1:]
        s_list = list(s.split(","))
        x_slice = [s_list[i] for i in range(len(s_list)) if "x" in s_list[i]]
        y_slice = [s_list[i] for i in range(len(s_list)) if "y" in s_list[i]]
        z_slice = [s_list[i] for i in range(len(s_list)) if "z" in s_list[i]]
        const_slice = [
            s_list[i] for i in range(len(s_list))
            if "x" not in s_list[i] and "y" not in s_list[i] and "z" not in s_list[i]
        ]
        x_coeff = self._extract_value_str_slice(x_slice)
        y_coeff = self._extract_value_str_slice(y_slice)
        z_coeff = self._extract_value_str_slice(z_slice)

        if len(const_slice) == 0:
            const = 0
        else:
            const = const_slice[0]
            const = convert_fraction_to_decimal(const)

        return x_coeff, y_coeff, z_coeff, const

    def _extract_transformation_matrix_from_symmetry_operator(self, expr_str: str) -> np.ndarray:
        """
        Converts a symmetry operator string into a 4x4 transformation matrix.

        Args:
            expr_str (str): Symmetry operation as string, e.g., 'x, -y, z+1/2'.

        Returns:
            np.ndarray: 4x4 transformation matrix.
        """
        expr_str = str(expr_str)
        expr_str = expr_str.strip("\n")
        expr_str = expr_str.replace(" ", "")
        split_str = expr_str.split(",")
        transformation_matrix = np.zeros((4, 4))
        transformation_matrix[3, 3] = 1
        for i in range(len(split_str)):
            x_coeff, y_coeff, z_coeff, const = self._extract_value_from_str(split_str[i])
            transformation_matrix[i] = [x_coeff, y_coeff, z_coeff, const]

        return transformation_matrix

    def _extract_symmetry_operation_from_lines(self, symmetry_sector: list[str]) -> list[str]:
        """
        Extracts symmetry operation strings from a sector of the CIF file.

        Args:
            symmetry_sector (list[str]): List of lines in the symmetry sector.

        Returns:
            list[str]: List of symmetry operation strings.
        """
        symmetry_operation = []
        for i in range(len(symmetry_sector)):
            pattern = r"([+-]?\d*\.?\d*)\s*([xyz])"
            match = re.search(pattern, symmetry_sector[i])
            if match:
                string = remove_quotes(symmetry_sector[i].strip("\n"))
                no_space_string = string.replace(" ", "")
                symmetry_operation.append(no_space_string)
        if len(symmetry_operation) < 2:
            if self._debug:
                self.ostream.print_info("P1 cell")
                self.ostream.flush()
            symmetry_operation = ["x,y,z"]
        else:
            if self._debug:
                self.ostream.print_info(
                    f"apply {len(symmetry_operation)}  symmetry operation")
                self.ostream.flush()

        return symmetry_operation

    def _fetch_spacegroup_from_cifinfo(self) -> str:
        """
        Fetches the space group name (Hermann-Mauguin) from the CIF info.

        Returns:
            str: The space group name if found; otherwise, 'P1'.
        """
        pattern = r"_symmetry_space_group_name_H-M\s+'([^']+)'"
        match = re.search(pattern, )
        if match:
            return match.group(1)
        else:
            return "P1"

    def _valid_net_name_line(self, line: str) -> str:
        """
        Checks if a line specifies a network name and returns a valid network name.

        Args:
            line (str): Line from CIF file.

        Returns:
            str: The detected network name.
        """
        if re.search(r"net", line):
            potential_net_name = line.split()[0].split("_")[1]
            if re.sub(r"[0-9]", "", potential_net_name) == "":
                return Path(self.filepath).stem
            else:
                return potential_net_name

    def _valid_spacegroup_line(self, line: str) -> str:
        """
        Checks a line for a space group or potential network name.

        Args:
            line (str): Line from CIF file.

        Returns:
            str: The spacegroup name or network name, defaults to 'P1' if not found.
        """
        if re.search(r"_symmetry_space_group_name_H-M", line):
            space_group = re.search(
                r"_symmetry_space_group_name_H-M\s+'([^']+)'", line)[1]
            return space_group
        elif re.search(r"^data_", line) and line.count("_") >= 3:
            potential_net_name = line.split()[0].split("_")[2]
            return potential_net_name
        return "P1"

    def _valid_hallnumber_line(self, line: str) -> str:
        """
        Checks a line for Hall number information.

        Args:
            line (str): Line from CIF file.

        Returns:
            str: The Hall number as found, or '1' if not found.
        """
        if re.search(r"_symmetry_Int_Tables_number", line):
            hall_number = re.search(r"_symmetry_Int_Tables_number\s+(\d+)", line)[1]
            return hall_number
        elif re.search(r"hall_number:\s*(\d+)", line):
            hall_number = re.search(r"hall_number:\s*(\d+)", line)[1]
            return hall_number
        return "1"


[docs]
    def read_cif(self, cif_file: str = None) -> None:
        """
        Reads a CIF file, storing relevant crystal and atom site sections in the instance.

        Args:
            cif_file (str, optional): Path to the CIF file. If None, uses self.filepath.

        Raises:
            AssertionError: If the CIF file path does not exist.
        """
        net_flag = False
        spacegroup_flag = False
        hallnumber_flag = False
        vcon_flag = False

        if cif_file is not None:
            self.filepath = cif_file
        assert_msg_critical(
            Path(self.filepath).exists(),
            f"cif file {self.filepath} not found")
        if self._debug:
            self.ostream.print_info(f"Reading cif file {self.filepath}")
            self.ostream.flush()

        def valid_line(line: str) -> bool:
            """Helper to test whether a line is meaningful (not comment or empty)."""
            return line.strip() != "" and not line.strip().startswith("#")

        with open(self.filepath, "r") as f:
            lines = f.readlines()
            nonempty_lines = [line for line in lines if valid_line(line)]

        self.ciffile_lines = nonempty_lines

        for line in nonempty_lines[:100]:
            if net_flag & spacegroup_flag & hallnumber_flag & vcon_flag:
                break
            if not net_flag and re.search(r"net", line):
                self.net_name = self._valid_net_name_line(line)
                net_flag = True
            if not spacegroup_flag:
                self.spacegroup = self._valid_spacegroup_line(line)
                if self.spacegroup != "P1":
                    spacegroup_flag = True
            if not hallnumber_flag:
                self.hall_number = self._valid_hallnumber_line(line)
                if self.hall_number != "1":
                    hallnumber_flag = True
            if not vcon_flag and re.search(r"V_con:\s*(\d+)", line):
                self.V_con = re.search(r"V_con:\s*(\d+)", line)[1]
                vcon_flag = True
                self.EC_con = re.search(r"EC_con:\s*(\d+)",
                                        line)[1] if re.search(
                                            r"EC_con:\s*(\d+)", line) else None

        if hasattr(self, 'net_name'):
            self.ostream.print_info(f"Found net name: {self.net_name}")
        if hasattr(self, 'spacegroup'):
            self.ostream.print_info(f"Spacegroup: {self.spacegroup}")

        self.ostream.flush()

        if self._debug:
            if hasattr(self, 'V_con'):
                self.ostream.print_info(f"Found V_con: {self.V_con}")
            if hasattr(self, 'EC_con'):
                self.ostream.print_info(f"Found EC_con: {self.EC_con}")
            self.ostream.flush()

        keyword1 = r"loop_"
        keyword2 = r"x,\s*y,\s*z"
        keyword3 = r"-x"
        # loop_key marks locations of important 'loop_' and 'x, y, z' tags

        loop_key = [0]
        linenumber = 0
        for i in range(len(nonempty_lines)):
            m = find_keyword(keyword1, nonempty_lines[i]) or (
                find_keyword(keyword2, nonempty_lines[i]) and
                (not find_keyword(keyword3, nonempty_lines[i])))

            a = re.search(r"_cell_length_a", nonempty_lines[i])
            b = re.search(r"_cell_length_b", nonempty_lines[i])
            c = re.search(r"_cell_length_c", nonempty_lines[i])
            alpha = re.search(r"_cell_angle_alpha", nonempty_lines[i])
            beta = re.search(r"_cell_angle_beta", nonempty_lines[i])
            gamma = re.search(r"_cell_angle_gamma", nonempty_lines[i])

            if m:
                loop_key.append(linenumber)
            else:
                if a:
                    cell_length_a = remove_bracket(
                        nonempty_lines[i].split()[1])
                elif b:
                    cell_length_b = remove_bracket(
                        nonempty_lines[i].split()[1])
                elif c:
                    cell_length_c = remove_bracket(
                        nonempty_lines[i].split()[1])
                elif alpha:
                    cell_angle_alpha = remove_bracket(
                        nonempty_lines[i].split()[1])
                elif beta:
                    cell_angle_beta = remove_bracket(
                        nonempty_lines[i].split()[1])
                elif gamma:
                    cell_angle_gamma = remove_bracket(
                        nonempty_lines[i].split()[1])

            linenumber += 1
        loop_key.append(len(nonempty_lines))
        loop_key = list(set(loop_key))
        loop_key.sort()

        cell_info = [
            cell_length_a,
            cell_length_b,
            cell_length_c,
            cell_angle_alpha,
            cell_angle_beta,
            cell_angle_gamma,
        ]

        cif_sectors = []
        for i in range(len(loop_key) - 1):
            cif_sectors.append(nonempty_lines[loop_key[i]:loop_key[i + 1]])
        for i in range(len(cif_sectors)):
            if re.search(keyword2, cif_sectors[i][0]):
                symmetry_sector = cif_sectors[i]

            if len(cif_sectors[i]) > 1:
                if re.search(r"_atom_site_label\s+", cif_sectors[i][1]):
                    atom_site_sector = cif_sectors[i]
        self.cell_info = cell_info
        self.symmetry_sector = symmetry_sector
        self.atom_site_sector = atom_site_sector


    def _limit_value_0_1(self, new_array_metal_xyz: np.ndarray) -> np.ndarray:
        """
        Ensures all entries are within [0,1) using modulo.

        Args:
            new_array_metal_xyz (np.ndarray): Input coordinates.

        Returns:
            np.ndarray: Coordinates wrapped to [0,1).
        """
        new_array_metal_xyz = np.mod(new_array_metal_xyz, 1)
        return new_array_metal_xyz

    def _wrap_fccords_to_0_1(self, fccords: np.ndarray) -> np.ndarray:
        """
        Center/wraps fractional coordinates to [0,1) and removes duplicated sites.

        Args:
            fccords (np.ndarray): Fractional coordinates.

        Returns:
            np.ndarray: Unique, wrapped fractional coordinates.
        """
        fccords = np.unique(np.array(fccords, dtype=float), axis=0)
        fccords = self._limit_value_0_1(fccords)
        fccords += 0.5
        fccords = self._limit_value_0_1(fccords)
        fccords += -0.5
        fccords = np.unique(np.array(fccords, dtype=float), axis=0)
        return fccords

    def _apply_sym_operator(
        self,
        symmetry_operations: list[str],
        array_metal_xyz: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Applies a list of symmetry operations to the coordinates and returns all unique sites.

        Args:
            symmetry_operations (list[str]): The symmetry operations in CIF string syntax.
            array_metal_xyz (np.ndarray): Coordinates to transform.

        Returns:
            tuple[np.ndarray, np.ndarray]: unique coordinates array and corresponding unique indices.
        """
        array_metal_extend_xyz = np.hstack(
            (array_metal_xyz, np.ones((len(array_metal_xyz), 1))))
        cell_array_metal_xyz = np.empty((0, 3))
        for sym_line_idx in range(len(symmetry_operations)):
            transfromation_matrix = self._extract_transformation_matrix_from_symmetry_operator(
                symmetry_operations[sym_line_idx])
            new_extend_xyz = np.matmul(transfromation_matrix,
                                       array_metal_extend_xyz.T).T
            new_xyz = new_extend_xyz[:, 0:3]
            cell_array_metal_xyz = np.vstack((cell_array_metal_xyz, new_xyz))

        round_cell_array_metal_xyz = np.round(
            self._wrap_fccords_to_0_1(cell_array_metal_xyz), 4)
        _, unique_indices = np.unique(round_cell_array_metal_xyz,
                                      axis=0,
                                      return_index=True)
        unique_indices.sort()
        unique_metal_array = round_cell_array_metal_xyz[unique_indices]

        return unique_metal_array, unique_indices

    def _extract_atoms_fcoords_from_lines(
        self,
        atom_site_sector: list[str]
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Extracts atom type, label, and fractional coordinates from the atom_site sector.

        Args:
            atom_site_sector (list[str]): Lines defining atomic sites in the CIF.

        Returns:
            tuple[np.ndarray, np.ndarray]: (array_atom: [atom_type, atom_label], array_xyz: [x, y, z])
        """
        atom_site_lines = []
        keyword = r"_"
        for line in atom_site_sector:
            m = re.search(keyword, line)
            if m is None:
                atom_site_lines.append(line)

        array_atom = np.zeros((len(atom_site_lines), 2), dtype=object)
        array_xyz = np.zeros((len(atom_site_lines), 3))

        for i in range(len(atom_site_lines)):
            for j in [0, 1, 2, 3, 4]:  # only need atom type, atom label, x,y,z
                if j < 2:
                    array_atom[i, j] = remove_tail_number(
                        atom_site_lines[i].split()[j])
                else:
                    array_xyz[i, (j - 2)] = remove_bracket(
                        atom_site_lines[i].split()[j])
        if self._debug:
            self.ostream.print_info(
                f"Found {len(array_atom)} atoms in atom_site_sector")
            self.ostream.print_info(f"array_atom {array_atom}")
            self.ostream.print_info(f"array_xyz {array_xyz}")
            self.ostream.flush()
        return array_atom, array_xyz


[docs]
    def get_type_atoms_fcoords_in_primitive_cell(
        self,
        target_type: str = None
    ) -> tuple[list, np.ndarray, np.ndarray]:
        """
        Get all atoms of a particular type in the primitive cell, applying symmetry.

        Args:
            target_type (str, optional): Atom type to select (e.g., "V", "E"). If None, uses all.

        Returns:
            tuple[list, np.ndarray, np.ndarray]: (cell_info, atom data, requested atom fractional coordinates)
        """
        array_atom, array_xyz = self._extract_atoms_fcoords_from_lines(
            self.atom_site_sector)
        if target_type is None:
            self.ostream.print_info(
                f"target_type not specified, use {target_type} as default")
            self.ostream.flush()
        if len(self.symmetry_sector) > 1:  # need to apply symmetry operations
            array_metal_xyz = array_xyz[array_atom[:, 0] == target_type]
            array_metal_xyz = np.round(array_metal_xyz, 4)
            symmetry_sector_neat = extract_quote_lines(self.symmetry_sector)
            if len(symmetry_sector_neat) < 2:  # if no quote, then find x,y,z
                symmetry_sector_neat = extract_xyz_lines(self.symmetry_sector)
            symmetry_operations = self._extract_symmetry_operation_from_lines(
                symmetry_sector_neat)
            no_sym_array_metal_xyz, no_sym_indices = self._apply_sym_operator(
                symmetry_operations, array_metal_xyz)
            array_metal_xyz_final = no_sym_array_metal_xyz
            array_atom = np.tile(array_atom,
                                 (len(symmetry_operations), 1))[no_sym_indices]

        else:
            array_metal_xyz = array_xyz[array_atom[:, 0] == target_type]
            array_metal_xyz_final = np.round(array_metal_xyz, 4)

        self.fcoords = self._wrap_fccords_to_0_1(array_metal_xyz_final)
        self.target_fcoords = self._wrap_fccords_to_0_1(array_metal_xyz_final)

        self.data = []
        for i in range(len(self.fcoords)):
            atom_number = i + 1
            atom_type = array_atom[i, 0] + str(atom_number)
            atom_label = array_atom[i, 1]
            residue_name = 'MOL'
            residue_number = 1
            x = self.fcoords[i, 0]
            y = self.fcoords[i, 1]
            z = self.fcoords[i, 2]
            spin = 1.00
            charge = 0.0
            note = ''
            self.data.append([
                atom_type, atom_label, atom_number, residue_name,
                residue_number, x, y, z, spin, charge, note
            ])
        self.data = np.vstack(self.data)
        if self._debug:
            self.ostream.print_info(
                f"Found {len(self.fcoords)} {target_type} atoms in primitive cell"
            )
            self.ostream.print_info(f"fcoords {self.fcoords}")
            self.ostream.flush()

        return self.cell_info, self.data, self.target_fcoords




if __name__ == "__main__":
    """
    Example driver for CifReader. Reads a test CIF file and prints
    coordinates and cell information for two atom types, 'V' and 'E'.
    """
    cif_file = "tests/testdata/test.cif"
    cif_reader = CifReader(filepath=cif_file)
    cif_reader._debug = True
    cif_reader.read_cif()
    cif_reader.get_type_atoms_fcoords_in_primitive_cell(target_type="V")
    if MPI.COMM_WORLD.Get_rank() == 0:
        print(cif_reader.target_fcoords)
        print(cif_reader.cell_info)

    cif_reader.get_type_atoms_fcoords_in_primitive_cell(target_type="E")
    if MPI.COMM_WORLD.Get_rank() == 0:
        print(cif_reader.target_fcoords)
        print("*" * 50)
        print(cif_reader.cell_info)