Source code for mofbuilder.io.xyz_reader

import numpy as np
from pathlib import Path
from typing import Optional, Any, Union
from veloxchem.outputstream import OutputStream
from veloxchem.veloxchemlib import mpi_master
from veloxchem.errorhandler import assert_msg_critical
import mpi4py.MPI as MPI
import sys
import re

"""
XYZ file reader for simple ASCII molecular coordinate files.

Expected atom line format in array:
atom_type, atom_label, atom_number, residue_name, residue_number, x, y, z, spin, charge, note
"""


[docs] class XyzReader: """Reader for XYZ molecular coordinate files. Handles loading of XYZ files with MPI-aware output and optional structure recentering. Attributes: comm (Any): MPI communicator. rank (int): MPI process rank. nodes (int): Number of MPI processes. ostream (OutputStream): Output stream for info/logging. filepath (Optional[str]): Path to the input .xyz file. data (Optional[np.ndarray]): Atom information as parsed and processed. _debug (bool): Enables debug output if True. Methods: read_xyz: Read, parse, and (optionally recenter) XYZ file into array format. """ def __init__( self, comm: Optional[Any] = None, ostream: Optional[OutputStream] = None, filepath: Optional[Union[str, Path]] = None, ) -> None: """Initializes the XyzReader. Args: comm (Any, optional): MPI communicator. Defaults to MPI.COMM_WORLD. ostream (Optional[OutputStream]): Output stream for info/debug. Defaults to output on master rank. filepath (Optional[str or Path]): Path to the .xyz file to load. """ if comm is None: comm = MPI.COMM_WORLD if ostream is None: if comm.Get_rank() == mpi_master(): ostream = OutputStream(sys.stdout) else: ostream = OutputStream(None) # mpi information self.comm = comm self.rank = self.comm.Get_rank() self.nodes = self.comm.Get_size() # output stream self.ostream = ostream self.filepath = filepath self.data: Optional[np.ndarray] = None # debug self._debug: bool = False
[docs] def read_xyz( self, filepath: Optional[Union[str, Path]] = None, recenter: bool = False, com_type: Optional[str] = None, residue_name: str = 'MOL', residue_number: int = 1, ) -> None: """Reads atom and coordinate information from an XYZ file, storing structure in a NumPy array. Optionally recenters structure coordinates to a specified atom type or whole molecule center of mass. Args: filepath (Optional[str or Path]): Path to the .xyz file. If not supplied, uses instance filepath. recenter (bool): If True, recenter coordinates to center-of-mass. Defaults to False. com_type (Optional[str]): If specified, use only atoms of this type for COM calculation. residue_name (str): Residue name assignment for all atoms. Defaults to 'MOL'. residue_number (int): Residue number for all atoms. Defaults to 1. Returns: None Raises: FileNotFoundError: If the specified file path does not exist. Note: Assigns atom_type, atom_label (e.g., "C1", "O2"), atom_number (1-based), residue_name, residue_number, x, y, z, spin (1.0), charge (from file or 0.0), and note (from file or empty) for each atom. Array output shape is (num_atoms, 11). """ if filepath is not None: self.filepath = filepath if self.filepath is None or not Path(self.filepath).exists(): raise FileNotFoundError(f"XYZ file {self.filepath} not found") with open(self.filepath, 'r') as f: lines = f.readlines() if len(lines) < 2: raise ValueError(f"XYZ file {self.filepath} has insufficient lines") comment = lines[1].strip() # Extract the atom coordinates from the subsequent lines data = [] for line in lines[2:]: if len(line.strip().split()) < 4: continue parts = line.split() atom_type = parts[0] atom_type = re.sub(r'\d', '', atom_type) # remove digits atom_number = len(data) + 1 atom_label = atom_type + str(atom_number) res_name = residue_name res_number = residue_number x = float(parts[1]) y = float(parts[2]) z = float(parts[3]) if len(parts) > 4: charge = float(parts[4]) else: charge = 0.0 if len(parts) > 5: note = parts[5] else: note = '' spin = 1.00 data.append(( atom_type, atom_label, atom_number, res_name, res_number, x, y, z, spin, charge, note )) def type_data(arr: np.ndarray) -> np.ndarray: """Ensures appropriate dtype conversion for numeric columns. Args: arr (np.ndarray): Array of atom records (shape: N, 11). Returns: np.ndarray: Array with selected columns cast to appropriate types. """ arr[:, 2] = arr[:, 2].astype(int) # atom_number arr[:, 4] = arr[:, 4].astype(int) # residue_number arr[:, 5:8] = arr[:, 5:8].astype(float) # x, y, z arr[:, 8] = arr[:, 8].astype(float) # spin arr[:, 9] = arr[:, 9].astype(float) # charge return arr self.data = type_data(np.array(data, dtype=object)) if recenter: # If not define com_type, use all atoms to calculate com if com_type is None: com_type_ccoords = self.data[:, 5:8].astype(float) else: # Try to get atom_type match for center of mass matched_types = self.data[:, 0] == com_type if not np.any(matched_types): self.ostream.print_warning( f"com_type {com_type} not in the xyz file, using all atoms to calculate com" ) com_type_ccoords = self.data[:, 5:8].astype(float) else: com_type_ccoords = self.data[matched_types][:, 5:8].astype(float) com = np.mean(com_type_ccoords, axis=0) if self._debug: self.ostream.print_info( f"Center of mass type {com_type} at {com}" ) self.data[:, 5:8] = self.data[:, 5:8].astype(float) - com