Source code for biochar.surface_builder

"""
Surface Builder for Porous Biochar Systems

Generates slit-pore (Phase 1) and amorphous (Phase 2, deferred — see #1) surface
systems consisting of multiple parallel PAH sheets. Each sheet is an
independent biochar molecule produced by the existing BiocharGenerator
pipeline. The builder positions the sheets in 3D, applies periodic
boundary conditions, and exports GROMACS-ready files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
from rdkit import Chem

from .biochar_generator import BiocharGenerator, GeneratorConfig
from .constants import CARBON_VDW_DIAMETER
from .gromacs_export import ITPFileWriter
from .heteroatom_assignment import _fix_heteroatom_bond_types
from .opls_typing import AtomTyper, ChargeAssigner


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

[docs] @dataclass class SheetResult: """ Result of generating and positioning a single sheet. Attributes: mol: RDKit molecule with 3-D conformer and OPLS-AA types. coords: Atomic coordinates in Ångströms, shape ``(N, 3)``. Updated in-place by :meth:`SurfaceBuilder.build` to world coordinates (translated along *z* after flattening). composition: Atom counts and ratios (:class:`~heteroatom_assignment.CompositionInfo`). atom_types: Mapping of atom index → OPLS-AA type string. charges: Mapping of atom index → partial charge (electrons). molecule_name: Residue name used in ``.gro`` and ``.itp`` files. Identical for all sheets when *sheet_overrides* is ``None``; indexed (e.g. ``SHT1``, ``SHT2``) for distinct sheets. """ mol: Chem.Mol coords: np.ndarray # Angstroms composition: object # CompositionInfo — avoid circular import atom_types: Dict[int, str] charges: Dict[int, float] molecule_name: str # e.g. "SHT" (identical) or "SHT1" (distinct)
[docs] @dataclass class SurfaceConfig: """ Configuration for :class:`SurfaceBuilder`. Attributes: target_num_carbons: Carbon atoms per sheet (passed to :class:`~biochar_generator.BiocharGenerator`). H_C_ratio: Target H/C ratio for each sheet. O_C_ratio: Target O/C ratio (used when *functional_groups* is ``None``). functional_groups: Functional groups applied to every sheet, e.g. ``{"phenolic": 2, "ether": 1}``. Overridden per-sheet via *sheet_overrides*. aromaticity_percent: Target aromatic carbon fraction (percentage). pore_type: Pore geometry. ``"slit"`` stacks parallel sheets along *z*; ``"amorphous"`` performs random rigid-body placement with steric rejection (each sheet independently rotated/translated). num_sheets: Number of parallel sheets. Must be ≥ 2. pore_diameter: Gap between the inner van-der-Waals surfaces of adjacent sheets, in Ångströms. The centre-to-centre sheet separation is ``pore_diameter + 3.4 Å`` (slit pore only). max_attempts: Maximum random placement attempts per sheet for ``pore_type="amorphous"`` before raising ``RuntimeError``. min_separation: Minimum allowed inter-sheet atom-atom distance (Ångströms) used as the steric-rejection criterion for ``pore_type="amorphous"``. sheet_overrides: Per-sheet configuration overrides. If provided, must be a list of dicts (one per sheet) with keys matching :class:`~biochar_generator.GeneratorConfig` fields. If ``None``, all sheets are chemically identical (one ``.itp`` with ``count = num_sheets``). box_padding_xy: Padding added to each side of the bounding box in *x* and *y*, in nm. box_padding_z: Padding added to each side of the bounding box in *z*, in nm. system_name: Name written to the ``[ system ]`` section of the ``.top`` file. sheet_base_name: Residue name prefix for sheet molecules. Must be ≤ 3 characters so that the full name (prefix + digit) fits within GROMACS' 5-character residue-name limit. seed: Random seed passed to each :class:`~biochar_generator.BiocharGenerator`. defect_fraction: Pentagon insertion probability for each sheet (see :attr:`~biochar_generator.GeneratorConfig.defect_fraction`). """ # --- Sheet chemistry (passed through to BiocharGenerator) --------------- target_num_carbons: int = 50 H_C_ratio: float = 0.3 O_C_ratio: float = 0.05 functional_groups: Optional[Dict[str, int]] = None aromaticity_percent: float = 95.0 # --- Pore geometry ------------------------------------------------------ pore_type: str = "slit" # "slit" (parallel stack) or "amorphous" num_sheets: int = 2 pore_diameter: float = 10.0 # Angstroms — gap between inner vdW surfaces # --- Amorphous packing -------------------------------------------------- # Maximum random placement attempts per sheet before giving up. max_attempts: int = 500 # Minimum allowed inter-sheet atom-atom distance (Angstroms). min_separation: float = 3.0 # --- Per-sheet overrides ------------------------------------------------ # If provided, must be a list of dicts (one per sheet) with keys matching # GeneratorConfig fields. If None, all sheets are chemically identical. sheet_overrides: Optional[List[Dict]] = None # --- Box padding (nm) --------------------------------------------------- box_padding_xy: float = 1.0 box_padding_z: float = 1.0 # --- Naming ------------------------------------------------------------- system_name: str = "SLIT" sheet_base_name: str = "SHT" # max 3 chars (+ digit ≤ 5 GROMACS limit) # --- Reproducibility ---------------------------------------------------- seed: Optional[int] = None # --- Ring defects ------------------------------------------------------- # Probability [0, 1) that each ring addition is a 5-membered pentagon. # 0.0 = pure hexagonal (default). defect_fraction: float = 0.0 # --- Strict validation -------------------------------------------------- # Propagated to each sheet's GeneratorConfig. Set False for lenient mode. strict: bool = True def __post_init__(self): self._validate() def _validate(self): if self.pore_type not in ("slit", "amorphous"): raise ValueError( f"pore_type must be 'slit' or 'amorphous' " f"(got '{self.pore_type}')." ) if self.num_sheets < 2: raise ValueError(f"num_sheets must be >= 2 (got {self.num_sheets})") if self.pore_diameter <= 0: raise ValueError(f"pore_diameter must be > 0 (got {self.pore_diameter})") if self.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1 (got {self.max_attempts})") if self.min_separation <= 0: raise ValueError( f"min_separation must be > 0 (got {self.min_separation})" ) if len(self.sheet_base_name) > 3: raise ValueError( f"sheet_base_name must be <= 3 chars for GROMACS compatibility " f"(got '{self.sheet_base_name}', {len(self.sheet_base_name)} chars)" ) if self.sheet_overrides is not None: if len(self.sheet_overrides) != self.num_sheets: raise ValueError( f"sheet_overrides length ({len(self.sheet_overrides)}) " f"must equal num_sheets ({self.num_sheets})" )
# --------------------------------------------------------------------------- # Rotation helper # --------------------------------------------------------------------------- def _rotation_matrix_from_vectors(a: np.ndarray, b: np.ndarray) -> np.ndarray: """ Compute rotation matrix *R* such that ``R @ a = b`` (both unit vectors). Uses Rodrigues' rotation formula. Handles degenerate cases where *a* and *b* are parallel or anti-parallel. """ a = a / np.linalg.norm(a) b = b / np.linalg.norm(b) v = np.cross(a, b) c = np.dot(a, b) if np.linalg.norm(v) < 1e-8: if c > 0: return np.eye(3) # 180-degree rotation: pick any perpendicular axis perp = np.array([1.0, 0.0, 0.0]) if abs(a[0]) < 0.9 else np.array([0.0, 1.0, 0.0]) perp = perp - np.dot(perp, a) * a perp = perp / np.linalg.norm(perp) return 2.0 * np.outer(perp, perp) - np.eye(3) vx = np.array([ [0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0], ]) return np.eye(3) + vx + vx @ vx / (1.0 + c) def _random_rotation_matrix(rng: np.random.Generator) -> np.ndarray: """ Draw a uniformly-distributed random rotation matrix in SO(3). Samples a uniform random unit quaternion (Shoemake's method) and converts it to a 3x3 rotation matrix. Self-contained (numpy only). """ u1, u2, u3 = rng.random(3) q1 = np.sqrt(1.0 - u1) * np.sin(2.0 * np.pi * u2) q2 = np.sqrt(1.0 - u1) * np.cos(2.0 * np.pi * u2) q3 = np.sqrt(u1) * np.sin(2.0 * np.pi * u3) q4 = np.sqrt(u1) * np.cos(2.0 * np.pi * u3) # Quaternion (x, y, z, w) = (q1, q2, q3, q4) -> rotation matrix x, y, z, w = q1, q2, q3, q4 return np.array([ [1 - 2 * (y * y + z * z), 2 * (x * y - z * w), 2 * (x * z + y * w)], [2 * (x * y + z * w), 1 - 2 * (x * x + z * z), 2 * (y * z - x * w)], [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x * x + y * y)], ]) # --------------------------------------------------------------------------- # Surface builder # ---------------------------------------------------------------------------
[docs] class SurfaceBuilder: """ Build slit-pore surface systems from parallel biochar sheets. The builder orchestrates a three-phase pipeline: 1. **Sheet generation** — creates each sheet via :class:`BiocharGenerator`, assigns OPLS-AA types and charges, then flattens the sheet to the *xy* plane using SVD. 2. **Positioning** — translates sheet *i* to ``z = i × (pore_diameter + 3.4 Å)`` so consecutive sheets are separated by the requested pore gap. 3. **Box computation** — wraps all sheets in an orthogonal periodic box padded by :attr:`SurfaceConfig.box_padding_xy` (nm) in *x*/*y* and :attr:`SurfaceConfig.box_padding_z` (nm) in *z*, then centres the system inside the box. When :attr:`SurfaceConfig.sheet_overrides` is ``None`` all sheets are chemically identical: only the first sheet is generated; the rest are deep-copied. A single ``.itp`` is produced and the ``.top`` lists it with ``count = num_sheets``. Use :func:`~biochar_generator.generate_surface` for a one-call convenience wrapper. Examples:: config = SurfaceConfig( target_num_carbons=60, functional_groups={"phenolic": 2}, pore_diameter=12.0, num_sheets=2, seed=42, ) builder = SurfaceBuilder(config) sheets, box_nm = builder.build() gro, top, itps = builder.export_gromacs("output", "slit_pore") """ def __init__(self, config: SurfaceConfig): self.config = config self.sheets: List[SheetResult] = [] self._box_vectors: Optional[np.ndarray] = None # nm # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------
[docs] def build(self) -> Tuple[List[SheetResult], np.ndarray]: """ Generate all sheets and compute their positioned coordinates. Returns: ``(list_of_SheetResult, box_vectors_nm)`` Each :class:`SheetResult`.coords is updated **in place** to world coordinates (translated along *z* so sheets are stacked with the requested pore gap). """ self.sheets = [] for i in range(self.config.num_sheets): print(f"\n{'='*60}") print(f"Generating sheet {i + 1}/{self.config.num_sheets}...") print(f"{'='*60}") sheet = self._generate_single_sheet(i) self.sheets.append(sheet) if self.config.pore_type == "amorphous": # Random rigid-body packing with steric rejection. The box is # sized to give enough free volume for all sheets, then each # sheet is randomly rotated and translated inside it. self._box_vectors = self._compute_amorphous_box_vectors() self._pack_amorphous(self._box_vectors) else: self._position_sheets() self._box_vectors = self._compute_box_vectors() self._centre_in_box(self._box_vectors) return self.sheets, self._box_vectors
[docs] def export_gromacs( self, output_directory: str = ".", basename: str = "surface", ) -> Tuple[Path, Path, List[Path]]: """ Export the positioned surface system to GROMACS files. Returns: ``(gro_path, top_path, list_of_itp_paths)`` """ if not self.sheets: raise RuntimeError("Must call build() before export_gromacs()") # Import here to avoid circular imports at module level from .gromacs_export import MultiSheetGROWriter, SurfaceTopologyWriter output_dir = Path(output_directory) output_dir.mkdir(parents=True, exist_ok=True) sheets_identical = self.config.sheet_overrides is None # --- Write .itp files ----------------------------------------------- itp_paths: List[Path] = [] if sheets_identical: itp_path = output_dir / f"{basename}_sheet.itp" ITPFileWriter.write( str(itp_path), self.sheets[0].mol, self.sheets[0].atom_types, self.sheets[0].charges, molecule_name=self.sheets[0].molecule_name, ) itp_paths.append(itp_path) else: for sheet in self.sheets: itp_path = output_dir / f"{basename}_{sheet.molecule_name.lower()}.itp" ITPFileWriter.write( str(itp_path), sheet.mol, sheet.atom_types, sheet.charges, molecule_name=sheet.molecule_name, ) itp_paths.append(itp_path) # --- Write combined .gro -------------------------------------------- gro_path = output_dir / f"{basename}.gro" MultiSheetGROWriter.write( str(gro_path), sheets=self.sheets, box_vectors=self._box_vectors, title=( f"Amorphous surface, {self.config.num_sheets} sheets" if self.config.pore_type == "amorphous" else ( f"Slit pore surface, pore_diameter=" f"{self.config.pore_diameter:.1f} A, " f"{self.config.num_sheets} sheets" ) ), ) # --- Write .top ----------------------------------------------------- top_path = output_dir / f"{basename}.top" SurfaceTopologyWriter.write( str(top_path), sheets=self.sheets, itp_paths=itp_paths, sheets_identical=sheets_identical, system_name=self.config.system_name, ) return gro_path, top_path, itp_paths
# ------------------------------------------------------------------ # Sheet generation # ------------------------------------------------------------------ def _generate_single_sheet(self, sheet_index: int) -> SheetResult: """Generate a single biochar sheet (or deep-copy if identical).""" # Identical-sheet optimisation: copy first sheet for indices > 0 if self.config.sheet_overrides is None and sheet_index > 0: base = self.sheets[0] return SheetResult( mol=Chem.Mol(base.mol), coords=base.coords.copy(), composition=base.composition, atom_types=dict(base.atom_types), charges=dict(base.charges), molecule_name=base.molecule_name, ) # Build GeneratorConfig for this sheet gen_kwargs = dict( target_num_carbons=self.config.target_num_carbons, H_C_ratio=self.config.H_C_ratio, O_C_ratio=self.config.O_C_ratio, aromaticity_percent=self.config.aromaticity_percent, functional_groups=self.config.functional_groups, defect_fraction=self.config.defect_fraction, seed=self.config.seed, strict=self.config.strict, ) # Apply per-sheet overrides if present if self.config.sheet_overrides is not None: overrides = self.config.sheet_overrides[sheet_index] gen_kwargs.update(overrides) # Molecule name: same for identical sheets, indexed for distinct if self.config.sheet_overrides is None: mol_name = self.config.sheet_base_name else: mol_name = f"{self.config.sheet_base_name}{sheet_index + 1}" gen_kwargs["molecule_name"] = mol_name config = GeneratorConfig(**gen_kwargs) generator = BiocharGenerator(config) mol, coords, composition = generator.generate() # Ensure clean bond types after pipeline mol = _fix_heteroatom_bond_types(mol) # OPLS typing (using public classes directly) typer = AtomTyper() atom_types = typer.assign_atom_types(mol) charger = ChargeAssigner() charges = charger.assign_charges(mol, atom_types) # Flatten sheet to xy plane coords = self._flatten_to_xy(coords) return SheetResult( mol=mol, coords=coords, composition=composition, atom_types=atom_types, charges=charges, molecule_name=mol_name, ) # ------------------------------------------------------------------ # Geometry helpers # ------------------------------------------------------------------ @staticmethod def _flatten_to_xy(coords: np.ndarray) -> np.ndarray: """ Rotate coordinates so the best-fit plane coincides with z=0, then translate centroid to origin. """ centroid = coords.mean(axis=0) centred = coords - centroid # SVD: smallest singular vector = plane normal _, _, vt = np.linalg.svd(centred) normal = vt[2] # Rotation to align normal with z-axis z_axis = np.array([0.0, 0.0, 1.0]) R = _rotation_matrix_from_vectors(normal, z_axis) rotated = (R @ centred.T).T # Zero residual z drift rotated[:, 2] -= rotated[:, 2].mean() return rotated def _position_sheets(self): """Translate each sheet along z to create slit pore gaps.""" for i, sheet in enumerate(self.sheets): z_offset = i * (self.config.pore_diameter + CARBON_VDW_DIAMETER) sheet.coords[:, 2] += z_offset def _compute_box_vectors(self) -> np.ndarray: """ Compute orthogonal periodic box vectors in nm. x, y: span of all sheets + 2 * box_padding_xy z: span of all sheets + 2 * box_padding_z """ all_coords = np.vstack([s.coords for s in self.sheets]) # Angstroms all_coords_nm = all_coords * 0.1 min_xyz = all_coords_nm.min(axis=0) max_xyz = all_coords_nm.max(axis=0) extent = max_xyz - min_xyz box = np.array([ extent[0] + 2 * self.config.box_padding_xy, extent[1] + 2 * self.config.box_padding_xy, extent[2] + 2 * self.config.box_padding_z, ]) return box def _centre_in_box(self, box_nm: np.ndarray): """ Translate all sheet coordinates so the system is centred in the box. Coordinates remain in Angstroms internally; *box_nm* is in nm. """ all_coords = np.vstack([s.coords for s in self.sheets]) min_xyz_A = all_coords.min(axis=0) max_xyz_A = all_coords.max(axis=0) current_centre = (min_xyz_A + max_xyz_A) / 2.0 box_centre_A = (box_nm * 10.0) / 2.0 # nm -> Angstroms shift = box_centre_A - current_centre for sheet in self.sheets: sheet.coords += shift # ------------------------------------------------------------------ # Amorphous packing # ------------------------------------------------------------------ def _compute_amorphous_box_vectors(self) -> np.ndarray: """ Compute a cubic periodic box (nm) for amorphous packing. The box must hold *num_sheets* arbitrarily-oriented sheets without forcing overlap, so it is sized from the largest sheet diameter and scaled by the cube root of the sheet count plus padding. """ # Largest pairwise extent of any single sheet (its longest diagonal # bounds the sheet under any rotation). max_diag_A = 0.0 for sheet in self.sheets: extent = sheet.coords.max(axis=0) - sheet.coords.min(axis=0) diag = float(np.linalg.norm(extent)) max_diag_A = max(max_diag_A, diag) max_diag_nm = max_diag_A * 0.1 # Linear scale so volume grows with sheet count; +1 sheet of margin. scale = (self.config.num_sheets + 1) ** (1.0 / 3.0) side = max_diag_nm * scale + 2 * self.config.box_padding_xy return np.array([side, side, side]) def _pack_amorphous(self, box_nm: np.ndarray): """ Place each sheet with a random rigid-body transform, rejecting placements that bring any two sheets closer than :attr:`SurfaceConfig.min_separation`. Each accepted sheet's :attr:`SheetResult.coords` is updated in place to its world coordinates (Ångströms). Raises ``RuntimeError`` if a sheet cannot be placed within :attr:`SurfaceConfig.max_attempts`. """ rng = np.random.default_rng(self.config.seed) box_A = box_nm * 10.0 # nm -> Angstroms min_sep = self.config.min_separation placed_coords: List[np.ndarray] = [] for i, sheet in enumerate(self.sheets): # Centre the sheet at the origin so rotation is about its centroid. base = sheet.coords - sheet.coords.mean(axis=0) radius = float(np.linalg.norm(base, axis=1).max()) placed = False for _ in range(self.config.max_attempts): R = _random_rotation_matrix(rng) rotated = (R @ base.T).T # Keep the sheet fully inside the box: translate the rotated # centroid (origin) to a point at least `radius` from each # wall. If the sheet is larger than the box, fall back to a # uniform translation across the whole box. lo = np.minimum(radius, box_A / 2.0) hi = np.maximum(box_A - radius, box_A / 2.0) translation = lo + rng.random(3) * (hi - lo) candidate = rotated + translation if self._is_clash_free(candidate, placed_coords, min_sep): sheet.coords = candidate placed_coords.append(candidate) placed = True break if not placed: raise RuntimeError( f"Failed to place sheet {i + 1}/{self.config.num_sheets} " f"after {self.config.max_attempts} attempts. " f"Increase the box size (reduce num_sheets, increase " f"box_padding_xy, or lower min_separation), or raise " f"max_attempts." ) @staticmethod def _is_clash_free( candidate: np.ndarray, placed: List[np.ndarray], min_sep: float, ) -> bool: """ Return ``True`` if *candidate* (N, 3) keeps every atom at least *min_sep* Å from all atoms of already-*placed* sheets. """ for other in placed: # Cheap centroid/radius pre-screen before the full distance check. c_centre = candidate.mean(axis=0) o_centre = other.mean(axis=0) c_rad = np.linalg.norm(candidate - c_centre, axis=1).max() o_rad = np.linalg.norm(other - o_centre, axis=1).max() if np.linalg.norm(c_centre - o_centre) > c_rad + o_rad + min_sep: continue # Full atom-atom minimum distance. diff = candidate[:, None, :] - other[None, :, :] dmin = np.sqrt((diff * diff).sum(axis=2)).min() if dmin < min_sep: return False return True