# Source code for mofbuilder.io.xyz_reader

import numpy as np
from pathlib import Path
from typing import Optional, Any, Union
from veloxchem.outputstream import OutputStream
from veloxchem.veloxchemlib import mpi_master
from veloxchem.errorhandler import assert_msg_critical
import mpi4py.MPI as MPI
import sys
import re

"""
XYZ file reader for simple ASCII molecular coordinate files.

Each row of the parsed array holds the following fields, in order:
atom_type, atom_label, atom_number, residue_name, residue_number, x, y, z, spin, charge, note
"""



# [docs]
class XyzReader:
    """Reader for XYZ molecular coordinate files.

    Loads an XYZ file into an object-dtype NumPy array with one row per atom
    and can optionally shift the coordinates so that a reference centroid
    sits at the origin.

    Attributes:
        comm (Any): MPI communicator.
        rank (int): MPI process rank.
        nodes (int): Number of MPI processes.
        ostream (OutputStream): Output stream for info/logging.
        filepath (Optional[str or Path]): Path to the input .xyz file.
        data (Optional[np.ndarray]): Parsed atom records, shape (num_atoms, 11);
            None until read_xyz has been called.
        _debug (bool): Enables debug output if True.

    Methods:
        read_xyz: Read, parse, and (optionally recenter) XYZ file into array format.
    """

    # Columns per atom record: atom_type, atom_label, atom_number,
    # residue_name, residue_number, x, y, z, spin, charge, note.
    _N_COLUMNS = 11

    def __init__(
        self,
        comm: Optional[Any] = None,
        ostream: Optional["OutputStream"] = None,
        filepath: Optional[Union[str, Path]] = None,
    ) -> None:
        """Initializes the XyzReader.

        Args:
            comm (Any, optional): MPI communicator. Defaults to MPI.COMM_WORLD.
            ostream (Optional[OutputStream]): Output stream for info/debug.
                When omitted, the master rank writes to stdout and every other
                rank gets a stream constructed with None.
            filepath (Optional[str or Path]): Path to the .xyz file to load.
        """
        if comm is None:
            comm = MPI.COMM_WORLD

        if ostream is None:
            if comm.Get_rank() == mpi_master():
                ostream = OutputStream(sys.stdout)
            else:
                ostream = OutputStream(None)

        # mpi information
        self.comm = comm
        self.rank = self.comm.Get_rank()
        self.nodes = self.comm.Get_size()

        # output stream
        self.ostream = ostream

        self.filepath = filepath
        self.data: Optional[np.ndarray] = None

        # debug
        self._debug: bool = False

    def read_xyz(
        self,
        filepath: Optional[Union[str, Path]] = None,
        recenter: bool = False,
        com_type: Optional[str] = None,
        residue_name: str = 'MOL',
        residue_number: int = 1,
    ) -> None:
        """Reads atom records from an XYZ file into ``self.data``.

        The first two lines of the file are skipped (only their presence is
        checked). Every following line with at least four whitespace-separated
        tokens is parsed as ``element x y z [charge [note]]``; shorter lines
        are ignored and tokens after the sixth are dropped.

        Args:
            filepath (Optional[str or Path]): Path to the .xyz file. If not
                supplied, uses the instance filepath; if supplied, it replaces
                the instance filepath.
            recenter (bool): If True, subtract a reference centroid from all
                coordinates. Defaults to False.
            com_type (Optional[str]): If given, the centroid is computed only
                from atoms whose (digit-stripped) type equals this value. If no
                atom matches, a warning is printed and all atoms are used.
            residue_name (str): Residue name assigned to all atoms. Defaults to 'MOL'.
            residue_number (int): Residue number assigned to all atoms. Defaults to 1.

        Returns:
            None

        Raises:
            FileNotFoundError: If no file path is set or the path does not exist.
            ValueError: If the file has fewer than two lines, or a coordinate
                or charge field of an atom line is not a valid number.

        Note:
            Each row holds atom_type (digits removed), atom_label (e.g. "C1",
            "O2"), atom_number (1-based), residue_name, residue_number,
            x, y, z, spin (1.0), charge (from file or 0.0) and note (from file
            or empty).

            Array output shape is (num_atoms, 11); a file without any atom
            line yields an empty (0, 11) array and recentering is skipped.

            The recentering reference is the unweighted mean of the selected
            coordinates; atomic masses are not used.
        """
        if filepath is not None:
            self.filepath = filepath

        if self.filepath is None or not Path(self.filepath).exists():
            raise FileNotFoundError(f"XYZ file {self.filepath} not found")

        with open(self.filepath, 'r') as f:
            lines = f.readlines()

        if len(lines) < 2:
            raise ValueError(f"XYZ file {self.filepath} has insufficient lines")

        res_number = int(residue_number)
        spin = 1.0

        records = []
        # Atom lines start on the third line of the file; line_no is 1-based
        # so that error messages match what an editor shows.
        for line_no, line in enumerate(lines[2:], start=3):
            parts = line.split()
            if len(parts) < 4:
                continue

            atom_type = re.sub(r'\d', '', parts[0])  # remove digits
            atom_number = len(records) + 1

            try:
                x, y, z = (float(token) for token in parts[1:4])
                charge = float(parts[4]) if len(parts) > 4 else 0.0
            except ValueError as err:
                # Same exception type as a bare float() failure, but with
                # enough context to locate the offending line.
                raise ValueError(
                    f"XYZ file {self.filepath} line {line_no}: "
                    f"invalid numeric field ({err})"
                ) from err

            note = parts[5] if len(parts) > 5 else ''

            records.append((
                atom_type, f"{atom_type}{atom_number}", atom_number,
                residue_name, res_number, x, y, z, spin, charge, note,
            ))

        if records:
            self.data = np.array(records, dtype=object)
        else:
            # np.array([]) would be 1-D; keep the documented 2-D layout so
            # column slicing on the result still works.
            self.data = np.empty((0, self._N_COLUMNS), dtype=object)

        # Nothing to shift when there are no atoms (the mean would be NaN).
        if not recenter or len(self.data) == 0:
            return

        coords = self.data[:, 5:8].astype(float)
        reference = coords  # default: use all atoms
        if com_type is not None:
            matched_types = self.data[:, 0] == com_type
            if np.any(matched_types):
                reference = coords[matched_types]
            else:
                self.ostream.print_warning(
                    f"com_type {com_type} not in the xyz file, using all atoms to calculate com"
                )

        com = np.mean(reference, axis=0)
        if self._debug:
            self.ostream.print_info(
                f"Center of mass type {com_type} at {com}"
            )
        self.data[:, 5:8] = coords - com