Source code for mofbuilder.io.pdb_reader

import numpy as np
from pathlib import Path
from typing import Optional, Any, Tuple
from .basic import nn
from veloxchem.outputstream import OutputStream
from veloxchem.veloxchemlib import mpi_master
from veloxchem.errorhandler import assert_msg_critical
import mpi4py.MPI as MPI
import sys

"""
atom_type, atom_label, atom_number, residue_name, residue_number, x, y, z, spin, charge, note
"""



[docs]
class PdbReader:
    """Class for reading and processing atomic coordinates from PDB files.

    Attributes:
        comm (MPI.Comm): MPI communicator.
        rank (int): MPI rank of current process.
        nodes (int): Total number of MPI processes.
        ostream (OutputStream): Output stream for logging.
        filepath (str | Path | None): Path to the PDB file.
        com_target_type (str): Atom type used for center-of-mass calculation.
        data (np.ndarray | None): Parsed atom data as array;
            see row format below.
        X_data (np.ndarray | None): Rows corresponding to atom type "X".
        node_atoms (np.ndarray | None): Atom types and labels for processed nodes.
        node_ccoords (np.ndarray | None): Coordinates for all node atoms, recentered.
        node_x_ccoords (np.ndarray | None): Coordinates for atoms of type "X", recentered.
        _debug (bool): Toggle debug-level logging.

    Methods:
        read_pdb: Reads the PDB file and extracts atom data.
        expand_arr2data: Converts an array of atom records into the library's standard format.
        process_node_pdb: Specialized recentering and extraction for node atoms.
    """

    def __init__(
        self,
        comm: Optional[Any] = None,
        ostream: Optional[OutputStream] = None,
        filepath: Optional[str] = None,
    ):
        """Initialize a PdbReader instance.

        Args:
            comm (MPI.Comm, optional): MPI communicator. Defaults to MPI.COMM_WORLD.
            ostream (OutputStream, optional): Output stream for logging and info.
            filepath (str, optional): Path to the PDB file.
        """
        if comm is None:
            comm = MPI.COMM_WORLD

        if ostream is None:
            if comm.Get_rank() == mpi_master():
                ostream = OutputStream(sys.stdout)
            else:
                ostream = OutputStream(None)

        self.comm = comm
        self.rank = self.comm.Get_rank()
        self.nodes = self.comm.Get_size()

        self.ostream = ostream
        self.filepath: Optional[str] = filepath
        self.com_target_type: str = "X"
        self.data: Optional[np.ndarray] = None

        self.X_data: Optional[np.ndarray] = None
        self.node_atoms: Optional[np.ndarray] = None
        self.node_ccoords: Optional[np.ndarray] = None
        self.node_x_ccoords: Optional[np.ndarray] = None

        self._debug: bool = False


[docs]
    def read_pdb(
        self,
        filepath: Optional[str] = None,
        recenter: bool = True,
        com_type: Optional[str] = None,
    ) -> None:
        """Read a PDB file, extract atoms, and optionally recenter coordinates.

        Populates self.data with shape (N, 11): [atom_type, atom_label, atom_number,
        residue_name, residue_number, value_x, value_y, value_z, occupancy, b_factor, note].

        Args:
            filepath (str, optional): Path to the PDB file. Overrides instance filepath if given.
            recenter (bool, optional): If True (default), recenter coordinates to center-of-mass.
            com_type (str, optional): Atom type for COM; if None, uses all atoms.

        Raises:
            AssertionError: If file does not exist.

        Note:
            Only ATOM/HETATM records are parsed; format assumes canonical PDB column mapping.
        """
        if filepath is not None:
            self.filepath = filepath
        assert_msg_critical(
            Path(self.filepath).exists(),
            f"pdb file {self.filepath} not found",
        )
        if self._debug:
            self.ostream.print_info(f"Reading pdb file {self.filepath}")

        inputfile = str(self.filepath)
        with open(inputfile, "r") as fp:
            lines = fp.readlines()

        data = []
        for line in lines:
            line = line.strip()
            if len(line) > 0:
                if line.startswith("ATOM") or line.startswith("HETATM"):
                    atom_number = int(line[6:11]) if line[6:11].strip() else 1
                    atom_type = line[12:16].strip()
                    residue_name = line[17:20].strip()
                    chain_id = line[21].strip() if line[21].strip() else "A"
                    residue_number = int(line[22:26]) if line[22:26].strip() else 1
                    value_x = float(line[30:38])
                    value_y = float(line[38:46])
                    value_z = float(line[46:54])
                    occupancy = float(line[54:60]) if line[54:60].strip() else 0.0
                    b_factor = float(line[60:66]) if line[60:66].strip() else 0.0
                    atom_label = line[76:78].strip()
                    charge = line[78:80].strip()
                    note = nn(atom_type)

                    data.append([
                        atom_type,         # Assigned atom name, e.g., "CA"
                        atom_label,        # Element symbol, e.g., "C"
                        atom_number,       # Serial number
                        residue_name,      # Residue name
                        residue_number,    # Residue index
                        value_x, value_y, value_z,
                        occupancy,         # Occupancy
                        b_factor,          # B-factor
                        note               # Custom mapping
                    ])

        self.data = np.vstack(data)

        def type_data(arr: np.ndarray) -> np.ndarray:
            """Ensure proper dtype for numeric and string columns."""
            arr[:, 2] = arr[:, 2].astype(int)
            arr[:, 4] = arr[:, 4].astype(int)
            arr[:, 5:8] = arr[:, 5:8].astype(float)
            arr[:, 8] = arr[:, 8].astype(float)
            arr[:, 9] = arr[:, 9].astype(float)
            return arr

        self.data = type_data(self.data)

        if recenter:
            # Use all atoms for center calculation if com_type not specified or not present.
            if com_type is None:
                com_type_ccoords = self.data[:, 5:8].astype(float)
            else:
                if com_type not in self.data[:, -1]:
                    self.ostream.print_warning(
                        f"com_type {com_type} not in the pdb file, use all atoms to calculate com"
                    )
                    com_type_ccoords = self.data[:, 5:8].astype(float)
                else:
                    com_type_ccoords = self.data[
                        self.data[:, -1] == com_type][:, 5:8].astype(float)
            com = np.mean(com_type_ccoords, axis=0)
            if self._debug:
                self.ostream.print_info(
                    f"Center of mass type {com_type} at {com}")
            self.data[:, 5:8] = self.data[:, 5:8].astype(float) - com

        self.X_data = self.data[self.data[:, -1] == "X"]



[docs]
    def expand_arr2data(
        self, arr: Optional[np.ndarray]
    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        """Convert array of minimal records to standard data format.

        Args:
            arr (np.ndarray | list): Array/list of [atom_type, atom_label, x, y, z].

        Returns:
            Tuple[np.ndarray | None, np.ndarray | None]:
                All parsed data (as for self.data), and extracted rows where note is 'X'.

        Example:
            data, x_data = pdb_reader.expand_arr2data([["Fe", "Fe", 0.0, 0.0, 0.0]])
        """
        if arr is None or len(arr) == 0:
            return None, None
        if isinstance(arr, list):
            arr = np.vstack(arr)

        data = []
        for i in range(len(arr)):
            atom_type = arr[i, 0]
            atom_label = arr[i, 1]
            value_x = float(arr[i, 2])
            value_y = float(arr[i, 3])
            value_z = float(arr[i, 4])
            atom_number = i + 1
            residue_name = "MOL"
            residue_number = 1
            charge = 0.0
            spin = 0
            note = nn(atom_type)
            data.append([
                atom_type, atom_label, atom_number, residue_name,
                residue_number, value_x, value_y, value_z, spin, charge, note
            ])
        data = np.vstack(data)
        X_data = data[data[:, -1] == 'X']
        return data, X_data



[docs]
    def process_node_pdb(self) -> None:
        """Read the PDB and prepare node-centered atom sets.

        Parses self.data, recenters coordinates for self.com_target_type
        (default 'X'), and stores node atoms, coordinates, and "X" atoms.

        Note:
            Intended for node PDB files (metal nodes) in the MOF framework.

        Populates:
            - self.node_atoms: [atom_type, atom_label] for all atoms
            - self.node_ccoords: centered coordinates for all atoms
            - self.node_x_ccoords: centered coordinates for "X" atoms
        """
        self.read_pdb()
        node_atoms = self.data[:, 0:2]
        node_ccoords = self.data[:, 5:8]
        node_ccoords = node_ccoords.astype(float)
        com_type_indices = [
            i for i in range(len(node_atoms))
            if nn(node_atoms[i, 0]) == self.com_target_type
        ]
        x_indices = [
            j for j in range(len(node_atoms)) if nn(node_atoms[j, 0]) == "X"
        ]
        node_x_ccoords = self.data[x_indices, 5:8]
        node_x_ccoords = node_x_ccoords.astype(float)
        com_type_ccoords = node_ccoords[com_type_indices]
        com_type = np.mean(com_type_ccoords, axis=0)
        node_ccoords = node_ccoords - com_type
        node_x_ccoords = node_x_ccoords - com_type

        if self._debug:
            self.ostream.print_info(
                f"center of mass type {self.com_target_type} at {com_type}")

            self.ostream.print_info(f"number of atoms: {len(node_atoms)}")
            self.ostream.print_info(
                f"number of X atoms: {len(node_x_ccoords)}")

        self.node_atoms = node_atoms
        self.node_ccoords = node_ccoords
        self.node_x_ccoords = node_x_ccoords




if __name__ == "__main__":
    pdb = PdbReader(filepath="tests/testdata/testnode.pdb")
    pdb._debug = True
    pdb.read_pdb()
    print(pdb.data)
    pdb.process_node_pdb()