Source code for ffsim.molecular_data

# (C) Copyright IBM 2023.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""The MolecularData class."""

from __future__ import annotations

import bz2
import dataclasses
import gzip
import lzma
import os
import tempfile
from collections.abc import Iterable
from functools import cached_property
from typing import Callable

import numpy as np
import orjson
import pyscf
import pyscf.cc
import pyscf.ci
import pyscf.fci
import pyscf.mcscf
import pyscf.mp
import pyscf.tools

from ffsim.hamiltonians import MolecularHamiltonian


[docs] @dataclasses.dataclass class MolecularData: """Class for storing molecular data. Attributes: core_energy (float): The core energy. one_body_integrals (np.ndarray): The one-body integrals. two_body_integrals (np.ndarray): The two-body integrals in compressed format. norb (int): The number of spatial orbitals. nelec (tuple[int, int]): The number of alpha and beta electrons. atom (list[tuple[str, list[float]]] | None): The coordinates of the atoms in the molecule. basis (str | None): The basis set, e.g. "sto-6g". spin (int | None): The spin of the molecule. symmetry (str | None): The symmetry of the molecule. mo_coeff (np.ndarray | None): Molecular orbital coefficients in the AO basis. mo_occ (np.ndarray | None): Molecular orbital occupancies. active_space (list[int] | None): The molecular orbitals included in the active space. hf_energy (float | None): The Hartree-Fock energy. hf_mo_coeff (np.ndarray | None): Hartree-Fock canonical orbital coefficients in the AO basis. hf_mo_occ (np.ndarray | None): Hartree-Fock canonical orbital occupancies. mp2_energy (float | None): The MP2 energy. mp2_t2 (np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None): The MP2 t2 amplitudes. ccsd_energy (float | None): The CCSD energy. ccsd_t1 (np.ndarray | tuple[np.ndarray, np.ndarray] | None): The CCSD t1 amplitudes. ccsd_t2 (np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None): The CCSD t2 amplitudes. cisd_energy (float | None): The CISD energy. cisd_vec (np.ndarray | None): The CISD state vector. sci_energy (float | None): The SCI energy. sci_vec (tuple[np.ndarray, np.ndarray, np.ndarray] | None): The SCI state vector coefficients, spin alpha strings, and spin beta strings. fci_energy (float | None): The FCI energy. fci_vec (np.ndarray | None): The FCI state vector. dipole_integrals (np.ndarray | None): The dipole integrals. orbital_symmetries (list[str] | None): The orbital symmetries. """ # Molecular integrals core_energy: float one_body_integrals: np.ndarray two_body_integrals: np.ndarray # Number of orbitals and numbers of alpha and beta electrons norb: int nelec: tuple[int, int] # Molecule information corresponding to attributes of pyscf.gto.Mole atom: list[tuple[str, list[float]]] | None = None basis: str | None = None spin: int | None = None symmetry: str | None = None # active space information mo_coeff: np.ndarray | None = None mo_occ: np.ndarray | None = None active_space: list[int] | None = None # Hartree-Fock data hf_energy: float | None = None hf_mo_coeff: np.ndarray | None = None hf_mo_occ: np.ndarray | None = None # MP2 data mp2_energy: float | None = None mp2_t2: np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None = None # CCSD data ccsd_energy: float | None = None ccsd_t1: np.ndarray | tuple[np.ndarray, np.ndarray] | None = None ccsd_t2: np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None = None # CISD data cisd_energy: float | None = None cisd_vec: np.ndarray | None = None # SCI data sci_energy: float | None = None sci_vec: tuple[np.ndarray, np.ndarray, np.ndarray] | None = None # FCI data fci_energy: float | None = None fci_vec: np.ndarray | None = None # other information dipole_integrals: np.ndarray | None = None orbital_symmetries: list[str] | None = None
[docs] @cached_property def hamiltonian(self) -> MolecularHamiltonian: """The Hamiltonian defined by the molecular data.""" return MolecularHamiltonian( one_body_tensor=self.one_body_integrals, two_body_tensor=pyscf.ao2mo.restore(1, self.two_body_integrals, self.norb), constant=self.core_energy, )
[docs] @cached_property def mole(self) -> pyscf.gto.Mole: """The PySCF Mole class for this molecular data.""" mol = pyscf.gto.Mole() return mol.build( atom=self.atom, basis=self.basis, spin=self.spin, symmetry=self.symmetry, )
[docs] @cached_property def scf(self) -> pyscf.scf.hf.SCF: """A PySCF SCF class for this molecular data.""" # HACK Not sure if there's a better way to do this... with tempfile.NamedTemporaryFile() as fp: self.to_fcidump(fp.name) return pyscf.tools.fcidump.to_scf(fp.name)
[docs] @staticmethod def from_scf( hartree_fock: pyscf.scf.hf.SCF, active_space: Iterable[int] | None = None ) -> "MolecularData": """Initialize a MolecularData object from a Hartree-Fock calculation. Args: hartree_fock: The Hartree-Fock object. active_space: An optional list of orbitals to use for the active space. """ if not hartree_fock.e_tot: hartree_fock = hartree_fock.run() hf_energy = hartree_fock.e_tot mol: pyscf.gto.Mole = hartree_fock.mol # Get core energy and one- and two-body integrals. if active_space is None: norb = mol.nao active_space = range(norb) active_space = list(active_space) norb = len(active_space) n_electrons = int(sum(hartree_fock.mo_occ[active_space])) n_alpha = (n_electrons + mol.spin) // 2 n_beta = (n_electrons - mol.spin) // 2 cas = pyscf.mcscf.CASCI(hartree_fock, norb, (n_alpha, n_beta)) mo = cas.sort_mo(active_space, base=0) one_body_tensor, core_energy = cas.get_h1cas(mo) two_body_integrals = cas.get_h2cas(mo) return MolecularData( core_energy=core_energy, one_body_integrals=one_body_tensor, two_body_integrals=two_body_integrals, norb=norb, nelec=(n_alpha, n_beta), atom=mol._atom, basis=mol.basis, spin=mol.spin, symmetry=mol.symmetry or None, mo_coeff=hartree_fock.mo_coeff, mo_occ=hartree_fock.mo_occ, active_space=active_space, hf_energy=hf_energy, )
[docs] def run_cisd(self, *, store_cisd_vec: bool = False) -> None: """Run CISD and store results.""" cisd = pyscf.ci.CISD(self.scf.run()) _, cisd_vec = cisd.kernel() self.cisd_energy = cisd.e_tot if store_cisd_vec: self.cisd_vec = cisd_vec
[docs] def run_sci(self, *, store_sci_vec: bool = False) -> None: """Run SCI and store results.""" sci = pyscf.fci.SCI(self.scf.run()) sci_energy, sci_vec = sci.kernel( self.one_body_integrals, self.two_body_integrals, norb=self.norb, nelec=self.nelec, ) self.sci_energy = sci_energy + self.core_energy if store_sci_vec: self.sci_vec = (sci_vec, *sci_vec._strs)
[docs] def run_fci(self, *, store_fci_vec: bool = False) -> None: """Run FCI and store results.""" cas = pyscf.mcscf.CASCI(self.scf.run(), ncas=self.norb, nelecas=self.nelec) _, _, fci_vec, _, _ = cas.kernel() self.fci_energy = cas.e_tot if store_fci_vec: self.fci_vec = fci_vec
[docs] def run_mp2(self, *, store_t2: bool = False): """Run MP2 and store results.""" mp2 = pyscf.mp.MP2(self.scf.run()) _, mp2_t2 = mp2.kernel() self.mp2_energy = mp2.e_tot if store_t2: self.mp2_t2 = mp2_t2
[docs] def run_ccsd( self, t1: np.ndarray | None = None, t2: np.ndarray | None = None, *, store_t1: bool = False, store_t2: bool = False, ) -> None: """Run CCSD and store results.""" ccsd = pyscf.cc.CCSD(self.scf.run()) _, ccsd_t1, ccsd_t2 = ccsd.kernel(t1=t1, t2=t2) self.ccsd_energy = ccsd.e_tot if store_t1: self.ccsd_t1 = ccsd_t1 if store_t2: self.ccsd_t2 = ccsd_t2
[docs] def to_json( self, file: str | bytes | os.PathLike, compression: str | None = None ) -> None: """Serialize to JSON format, optionally compressed, and save to disk. Args: file: The file path to save to. compression: The optional compression algorithm to use. Options: ``"gzip"``, ``"bz2"``, ``"lzma"``. """ def default(obj): if isinstance(obj, np.ndarray): return np.ascontiguousarray(obj) raise TypeError open_func: dict[str | None, Callable] = { None: open, "gzip": gzip.open, "bz2": bz2.open, "lzma": lzma.open, } for attribute in ["hamiltonian", "mole", "scf"]: try: delattr(self, attribute) except AttributeError: pass with open_func[compression](file, "wb") as f: f.write( orjson.dumps(self, option=orjson.OPT_SERIALIZE_NUMPY, default=default) )
[docs] @staticmethod def from_json( file: str | bytes | os.PathLike, compression: str | None = None ) -> MolecularData: """Load a MolecularData from a (possibly compressed) JSON file. Args: file: The file path to read from. compression: The compression algorithm, if any, which was used to compress the file. Options: ``"gzip"``, ``"bz2"``, ``"lzma"``. Returns: The MolecularData object. """ open_func: dict[str | None, Callable] = { None: open, "gzip": gzip.open, "bz2": bz2.open, "lzma": lzma.open, } with open_func[compression](file, "rb") as f: data = orjson.loads(f.read()) def as_array_or_none(val): if val is None: return None return np.asarray(val) def as_array_tuple_or_none(val): if val is None: return None return tuple(np.asarray(arr) for arr in val) nelec = tuple(data["nelec"]) n_alpha, n_beta = nelec arrays_func = as_array_or_none if n_alpha == n_beta else as_array_tuple_or_none atom = data.get("atom") if atom is not None: atom = [tuple(a) for a in atom] return MolecularData( core_energy=data["core_energy"], one_body_integrals=np.asarray(data["one_body_integrals"]), two_body_integrals=np.asarray(data["two_body_integrals"]), norb=data["norb"], nelec=nelec, atom=atom, basis=data.get("basis"), spin=data.get("spin"), symmetry=data.get("symmetry"), mo_coeff=as_array_or_none(data.get("mo_coeff")), mo_occ=as_array_or_none(data.get("mo_occ")), active_space=data.get("active_space"), hf_energy=data.get("hf_energy"), hf_mo_coeff=as_array_or_none(data.get("hf_mo_coeff")), hf_mo_occ=as_array_or_none(data.get("hf_mo_occ")), mp2_energy=data.get("mp2_energy"), mp2_t2=arrays_func(data.get("mp2_t2")), ccsd_energy=data.get("ccsd_energy"), ccsd_t1=arrays_func(data.get("ccsd_t1")), ccsd_t2=arrays_func(data.get("ccsd_t2")), cisd_energy=data.get("cisd_energy"), cisd_vec=as_array_or_none(data.get("cisd_vec")), sci_energy=data.get("sci_energy"), sci_vec=as_array_tuple_or_none(data.get("sci_vec")), fci_energy=data.get("fci_energy"), fci_vec=as_array_or_none(data.get("fci_vec")), dipole_integrals=as_array_or_none(data.get("dipole_integrals")), orbital_symmetries=data.get("orbital_symmetries"), )
[docs] def to_fcidump(self, file: str | bytes | os.PathLike) -> None: """Save data to disk in FCIDUMP format. .. note:: The FCIDUMP format does not retain all information stored in the MolecularData object. To serialize a MolecularData object losslessly, use the :func:`to_json` method to save to JSON format. Args: file: The file path to save to. """ pyscf.tools.fcidump.from_integrals( file, h1e=self.one_body_integrals, h2e=self.two_body_integrals, nuc=self.core_energy, nmo=self.norb, nelec=self.nelec, )
[docs] @staticmethod def from_fcidump(file: str | bytes | os.PathLike) -> MolecularData: """Initialize a MolecularData from an FCIDUMP file. Args: file: The FCIDUMP file path. """ data = pyscf.tools.fcidump.read(file, verbose=False) n_electrons = data["NELEC"] spin = data["MS2"] n_alpha = (n_electrons + spin) // 2 n_beta = (n_electrons - spin) // 2 return MolecularData( core_energy=data["ECORE"], one_body_integrals=data["H1"], two_body_integrals=data["H2"], norb=data["NORB"], nelec=(n_alpha, n_beta), spin=spin, )