# (C) Copyright IBM 2023.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.
"""The MolecularData class.
The MolecularData class stores molecular integrals, geometry, energies, and other data.
It can be saved to and loaded from JSON (lossless) and FCIDUMP (integrals only) formats.
"""
from __future__ import annotations
import bz2
import dataclasses
import gzip
import lzma
import os
import tempfile
from collections.abc import Iterable
from functools import cached_property
from typing import Callable
import numpy as np
import orjson
import pyscf
import pyscf.cc
import pyscf.ci
import pyscf.fci
import pyscf.mcscf
import pyscf.mp
import pyscf.tools
from ffsim.hamiltonians import MolecularHamiltonian
[docs]
@dataclasses.dataclass(kw_only=True)
class MolecularData:
"""Class for storing molecular data."""
# Molecular integrals
core_energy: float
"""The core energy."""
one_body_integrals: np.ndarray
"""The one-body integrals."""
two_body_integrals: np.ndarray
"""The two-body integrals in compressed format."""
# Number of orbitals and numbers of alpha and beta electrons
norb: int
"""The number of spatial orbitals."""
nelec: tuple[int, int]
"""The numbers of alpha and beta electrons."""
# Molecule information corresponding to attributes of pyscf.gto.Mole
atom: list[tuple[str, list[float]]] | None = None
"""The coordinates of the atoms in the molecule."""
basis: str | None = None
"""The basis set, e.g. ``"sto-6g"``."""
spin: int | None = None
"""The spin of the molecule."""
symmetry: str | None = None
"""The symmetry of the molecule."""
# active space information
mo_coeff: np.ndarray | None = None
"""Molecular orbital coefficients in the AO basis."""
mo_occ: np.ndarray | None = None
"""Molecular orbital occupancies."""
active_space: list[int] | None = None
"""The molecular orbitals included in the active space."""
# Hartree-Fock data
hf_energy: float | None = None
"""The Hartree-Fock energy."""
hf_mo_coeff: np.ndarray | None = None
"""Hartree-Fock canonical orbital coefficients in the AO basis."""
hf_mo_occ: np.ndarray | None = None
"""Hartree-Fock canonical orbital occupancies."""
# MP2 data
mp2_energy: float | None = None
"""The MP2 energy."""
mp2_t2: np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None = None
"""The MP2 t2 amplitudes."""
# CCSD data
ccsd_energy: float | None = None
"""The CCSD energy."""
ccsd_t1: np.ndarray | tuple[np.ndarray, np.ndarray] | None = None
"""The CCSD t1 amplitudes."""
ccsd_t2: np.ndarray | tuple[np.ndarray, np.ndarray, np.ndarray] | None = None
"""The CCSD t2 amplitudes."""
# CISD data
cisd_energy: float | None = None
"""The CISD energy."""
cisd_vec: np.ndarray | None = None
"""The CISD state vector."""
# SCI data
sci_energy: float | None = None
"""The SCI energy."""
sci_vec: tuple[np.ndarray, np.ndarray, np.ndarray] | None = None
"""The SCI state vector coefficients, spin alpha strings, and spin beta strings."""
# FCI data
fci_energy: float | None = None
"""The FCI energy."""
fci_vec: np.ndarray | None = None
"""The FCI state vector."""
# DMRG data
dmrg_energy: float | None = None
"""The DMRG energy."""
# other information
dipole_integrals: np.ndarray | None = None
"""The dipole integrals."""
orbital_symmetries: list[str] | None = None
"""The orbital symmetries."""
[docs]
@cached_property
def hamiltonian(self) -> MolecularHamiltonian:
"""The Hamiltonian defined by the molecular data."""
return MolecularHamiltonian(
one_body_tensor=self.one_body_integrals,
two_body_tensor=pyscf.ao2mo.restore(1, self.two_body_integrals, self.norb),
constant=self.core_energy,
)
[docs]
@cached_property
def mole(self) -> pyscf.gto.Mole:
"""The PySCF Mole class for this molecular data."""
mol = pyscf.gto.Mole()
return mol.build(
atom=self.atom,
basis=self.basis,
spin=self.spin,
symmetry=self.symmetry,
)
[docs]
@cached_property
def scf(self) -> pyscf.scf.hf.SCF:
"""A PySCF SCF class for this molecular data."""
# HACK Not sure if there's a better way to do this...
with tempfile.NamedTemporaryFile() as fp:
self.to_fcidump(fp.name)
return pyscf.tools.fcidump.to_scf(fp.name)
[docs]
@staticmethod
def from_scf(
hartree_fock: pyscf.scf.hf.SCF, active_space: Iterable[int] | None = None
) -> "MolecularData":
"""Initialize a MolecularData object from a Hartree-Fock calculation.
Args:
hartree_fock: The Hartree-Fock object.
active_space: An optional list of orbitals to use for the active space.
"""
if not hartree_fock.e_tot:
hartree_fock = hartree_fock.run()
hf_energy = hartree_fock.e_tot
mol: pyscf.gto.Mole = hartree_fock.mol
# Get core energy and one- and two-body integrals.
if active_space is None:
norb = mol.nao
active_space = range(norb)
active_space = list(active_space)
norb = len(active_space)
n_electrons = int(sum(hartree_fock.mo_occ[active_space]))
n_alpha = (n_electrons + mol.spin) // 2
n_beta = (n_electrons - mol.spin) // 2
cas = pyscf.mcscf.CASCI(hartree_fock, norb, (n_alpha, n_beta))
mo = cas.sort_mo(active_space, base=0)
one_body_tensor, core_energy = cas.get_h1cas(mo)
two_body_integrals = cas.get_h2cas(mo)
return MolecularData(
core_energy=core_energy,
one_body_integrals=one_body_tensor,
two_body_integrals=two_body_integrals,
norb=norb,
nelec=(n_alpha, n_beta),
atom=mol._atom,
basis=mol.basis,
spin=mol.spin,
symmetry=mol.symmetry or None,
mo_coeff=hartree_fock.mo_coeff,
mo_occ=hartree_fock.mo_occ,
active_space=active_space,
hf_energy=hf_energy,
)
[docs]
def run_cisd(self, *, store_cisd_vec: bool = False) -> None:
"""Run CISD and store results."""
cisd = pyscf.ci.CISD(self.scf.run())
_, cisd_vec = cisd.kernel()
self.cisd_energy = cisd.e_tot
if store_cisd_vec:
self.cisd_vec = cisd_vec
[docs]
def run_sci(self, *, store_sci_vec: bool = False) -> None:
"""Run SCI and store results."""
sci = pyscf.fci.SCI(self.scf.run())
sci_energy, sci_vec = sci.kernel(
self.one_body_integrals,
self.two_body_integrals,
norb=self.norb,
nelec=self.nelec,
)
self.sci_energy = sci_energy + self.core_energy
if store_sci_vec:
self.sci_vec = (sci_vec, *sci_vec._strs)
[docs]
def run_fci(self, *, store_fci_vec: bool = False) -> None:
"""Run FCI and store results."""
cas = pyscf.mcscf.CASCI(self.scf.run(), ncas=self.norb, nelecas=self.nelec)
_, _, fci_vec, _, _ = cas.kernel()
self.fci_energy = cas.e_tot
if store_fci_vec:
self.fci_vec = fci_vec
[docs]
def run_mp2(self, *, store_t2: bool = False):
"""Run MP2 and store results."""
mp2 = pyscf.mp.MP2(self.scf.run())
_, mp2_t2 = mp2.kernel()
self.mp2_energy = mp2.e_tot
if store_t2:
self.mp2_t2 = mp2_t2
[docs]
def run_ccsd(
self,
t1: np.ndarray | None = None,
t2: np.ndarray | None = None,
*,
store_t1: bool = False,
store_t2: bool = False,
) -> None:
"""Run CCSD and store results."""
ccsd = pyscf.cc.CCSD(self.scf.run())
_, ccsd_t1, ccsd_t2 = ccsd.kernel(t1=t1, t2=t2)
self.ccsd_energy = ccsd.e_tot
if store_t1:
self.ccsd_t1 = ccsd_t1
if store_t2:
self.ccsd_t2 = ccsd_t2
[docs]
def to_json(
self, file: str | bytes | os.PathLike, compression: str | None = None
) -> None:
"""Serialize to JSON format, optionally compressed, and save to disk.
Args:
file: The file path to save to.
compression: The optional compression algorithm to use.
Options: ``"gzip"``, ``"bz2"``, ``"lzma"``.
"""
def default(obj):
if isinstance(obj, np.ndarray):
return np.ascontiguousarray(obj)
raise TypeError
open_func: dict[str | None, Callable] = {
None: open,
"gzip": gzip.open,
"bz2": bz2.open,
"lzma": lzma.open,
}
for attribute in ["hamiltonian", "mole", "scf"]:
try:
delattr(self, attribute)
except AttributeError:
pass
with open_func[compression](file, "wb") as f:
f.write(
orjson.dumps(self, option=orjson.OPT_SERIALIZE_NUMPY, default=default)
)
[docs]
@staticmethod
def from_json(
file: str | bytes | os.PathLike, compression: str | None = None
) -> MolecularData:
"""Load a MolecularData from a (possibly compressed) JSON file.
Args:
file: The file path to read from.
compression: The compression algorithm, if any, which was used to compress
the file.
Options: ``"gzip"``, ``"bz2"``, ``"lzma"``.
Returns: The MolecularData object.
"""
open_func: dict[str | None, Callable] = {
None: open,
"gzip": gzip.open,
"bz2": bz2.open,
"lzma": lzma.open,
}
with open_func[compression](file, "rb") as f:
data = orjson.loads(f.read())
def as_array_or_none(val):
if val is None:
return None
return np.asarray(val)
def as_array_tuple_or_none(val):
if val is None:
return None
return tuple(np.asarray(arr) for arr in val)
nelec = tuple(data["nelec"])
n_alpha, n_beta = nelec
arrays_func = as_array_or_none if n_alpha == n_beta else as_array_tuple_or_none
atom = data.get("atom")
if atom is not None:
atom = [tuple(a) for a in atom]
return MolecularData(
core_energy=data["core_energy"],
one_body_integrals=np.asarray(data["one_body_integrals"]),
two_body_integrals=np.asarray(data["two_body_integrals"]),
norb=data["norb"],
nelec=nelec,
atom=atom,
basis=data.get("basis"),
spin=data.get("spin"),
symmetry=data.get("symmetry"),
mo_coeff=as_array_or_none(data.get("mo_coeff")),
mo_occ=as_array_or_none(data.get("mo_occ")),
active_space=data.get("active_space"),
hf_energy=data.get("hf_energy"),
hf_mo_coeff=as_array_or_none(data.get("hf_mo_coeff")),
hf_mo_occ=as_array_or_none(data.get("hf_mo_occ")),
mp2_energy=data.get("mp2_energy"),
mp2_t2=arrays_func(data.get("mp2_t2")),
ccsd_energy=data.get("ccsd_energy"),
ccsd_t1=arrays_func(data.get("ccsd_t1")),
ccsd_t2=arrays_func(data.get("ccsd_t2")),
cisd_energy=data.get("cisd_energy"),
cisd_vec=as_array_or_none(data.get("cisd_vec")),
sci_energy=data.get("sci_energy"),
sci_vec=as_array_tuple_or_none(data.get("sci_vec")),
fci_energy=data.get("fci_energy"),
fci_vec=as_array_or_none(data.get("fci_vec")),
dmrg_energy=data.get("dmrg_energy"),
dipole_integrals=as_array_or_none(data.get("dipole_integrals")),
orbital_symmetries=data.get("orbital_symmetries"),
)
[docs]
def to_fcidump(self, file: str | bytes | os.PathLike) -> None:
"""Save data to disk in FCIDUMP format.
.. note::
The FCIDUMP format does not retain all information stored in the
MolecularData object. To serialize a MolecularData object losslessly, use
the :func:`to_json` method to save to JSON format.
Args:
file: The file path to save to.
"""
pyscf.tools.fcidump.from_integrals(
file,
h1e=self.one_body_integrals,
h2e=self.two_body_integrals,
nuc=self.core_energy,
nmo=self.norb,
nelec=self.nelec,
)
[docs]
@staticmethod
def from_fcidump(file: str | bytes | os.PathLike) -> MolecularData:
"""Initialize a MolecularData from an FCIDUMP file.
Args:
file: The FCIDUMP file path.
"""
data = pyscf.tools.fcidump.read(file, verbose=False)
n_electrons = data["NELEC"]
spin = data["MS2"]
n_alpha = (n_electrons + spin) // 2
n_beta = (n_electrons - spin) // 2
return MolecularData(
core_energy=data["ECORE"],
one_body_integrals=data["H1"],
two_body_integrals=data["H2"],
norb=data["NORB"],
nelec=(n_alpha, n_beta),
spin=spin,
)