JoonhoLee-Group · yfhuang93 · Mar 25, 2024 · Mar 25, 2024 · Mar 25, 2024 · Mar 25, 2024
diff --git a/ipie/estimators/greens_function_multi_det.py b/ipie/estimators/greens_function_multi_det.py
@@ -2,16 +2,23 @@
 import scipy.linalg
 from numba import jit
 
-from ipie.estimators.kernels.cpu import wicks as wk
-from ipie.legacy.estimators.greens_function import gab_mod
+from ipie.config import config
 from ipie.propagation.overlap import (
     compute_determinants_batched,
-    get_cofactor_matrix_batched,
-    get_det_matrix_batched,
     get_overlap_one_det_wicks,
-    reduce_to_CI_tensor,
+    get_cofactor_matrix_batched,
 )
 from ipie.utils.linalg import minor_mask
+from ipie.propagation.overlap import get_det_matrix_batched, reduce_to_CI_tensor
+
+if config.get_option("use_gpu"):
+    from ipie.estimators.kernels.gpu import wicks_gpu as wk
+else:
+    from ipie.estimators.kernels.cpu import wicks as wk
+
+from ipie.legacy.estimators.greens_function import gab_mod
+
+from ipie.utils.backend import arraylib as xp
 
 
 def greens_function_multi_det(walker_batch, trial, build_full=False):
@@ -145,7 +152,7 @@ def greens_function_multi_det_wicks(walker_batch, trial, build_full=False):
     det : float64 / complex128
         Determinant of overlap matrix.
     """
-    tot_ovlps = numpy.zeros(walker_batch.nwalkers, dtype=numpy.complex128)
+    tot_ovlps = xp.zeros(walker_batch.nwalkers, dtype=numpy.complex128)
     nbasis = walker_batch.Ga.shape[-1]
 
     walker_batch.Ga.fill(0.0 + 0.0j)
@@ -155,15 +162,15 @@ def greens_function_multi_det_wicks(walker_batch, trial, build_full=False):
         phia = walker_batch.phia[iw]  # walker wfn
         phib = walker_batch.phib[iw]  # walker wfn
 
-        Oalpha = numpy.dot(trial.psi0a.conj().T, phia)
-        sign_a, logdet_a = numpy.linalg.slogdet(Oalpha)
+        Oalpha = xp.dot(trial.psi0a.conj().T, phia)
+        sign_a, logdet_a = xp.linalg.slogdet(Oalpha)
         logdet_b, sign_b = 0.0, 1.0
-        Obeta = numpy.dot(trial.psi0b.conj().T, phib)
-        sign_b, logdet_b = numpy.linalg.slogdet(Obeta)
+        Obeta = xp.dot(trial.psi0b.conj().T, phib)
+        sign_b, logdet_b = xp.linalg.slogdet(Obeta)
 
-        ovlp0 = sign_a * sign_b * numpy.exp(logdet_a + logdet_b)
-        walker_batch.det_ovlpas[iw, 0] = sign_a * numpy.exp(logdet_a)
-        walker_batch.det_ovlpbs[iw, 0] = sign_b * numpy.exp(logdet_b)
+        ovlp0 = sign_a * sign_b * xp.exp(logdet_a + logdet_b)
+        walker_batch.det_ovlpas[iw, 0] = sign_a * xp.exp(logdet_a)
+        walker_batch.det_ovlpbs[iw, 0] = sign_b * xp.exp(logdet_b)
 
         # G0, G0H = gab_spin(trial.psi0, phi, nup, ndown)
         G0a, G0Ha = gab_mod(trial.psi0a, phia)
@@ -172,8 +179,8 @@ def greens_function_multi_det_wicks(walker_batch, trial, build_full=False):
         walker_batch.G0b[iw] = G0b
         walker_batch.Ghalfa[iw] = G0Ha
         walker_batch.Ghalfb[iw] = G0Hb
-        walker_batch.Q0a[iw] = numpy.eye(nbasis) - walker_batch.G0a[iw]
-        walker_batch.Q0b[iw] = numpy.eye(nbasis) - walker_batch.G0b[iw]
+        walker_batch.Q0a[iw] = xp.eye(nbasis) - walker_batch.G0a[iw]
+        walker_batch.Q0b[iw] = xp.eye(nbasis) - walker_batch.G0b[iw]
 
         G0a = walker_batch.G0a[iw]
         G0b = walker_batch.G0b[iw]
@@ -268,13 +275,13 @@ def greens_function_multi_det_wicks(walker_batch, trial, build_full=False):
                 )  # 2 2
 
             elif nex_a > 3:
-                det_a = numpy.zeros((nex_a, nex_a), dtype=numpy.complex128)
+                det_a = xp.zeros((nex_a, nex_a), dtype=numpy.complex128)
                 for iex in range(nex_a):
                     det_a[iex, iex] = G0a[trial.cre_a[jdet][iex], trial.anh_a[jdet][iex]]
                     for jex in range(iex + 1, nex_a):
                         det_a[iex, jex] = G0a[trial.cre_a[jdet][iex], trial.anh_a[jdet][jex]]
                         det_a[jex, iex] = G0a[trial.cre_a[jdet][jex], trial.anh_a[jdet][iex]]
-                cofactor = numpy.zeros((nex_a - 1, nex_a - 1), dtype=numpy.complex128)
+                cofactor = xp.zeros((nex_a - 1, nex_a - 1), dtype=numpy.complex128)
                 for iex in range(nex_a):
                     p = trial.cre_a[jdet][iex]
                     for jex in range(nex_a):
@@ -334,13 +341,13 @@ def greens_function_multi_det_wicks(walker_batch, trial, build_full=False):
                 )  # 2 2
 
             elif nex_b > 3:
-                det_b = numpy.zeros((nex_b, nex_b), dtype=numpy.complex128)
+                det_b = xp.zeros((nex_b, nex_b), dtype=numpy.complex128)
                 for iex in range(nex_b):
                     det_b[iex, iex] = G0b[trial.cre_b[jdet][iex], trial.anh_b[jdet][iex]]
                     for jex in range(iex + 1, nex_b):
                         det_b[iex, jex] = G0b[trial.cre_b[jdet][iex], trial.anh_b[jdet][jex]]
                         det_b[jex, iex] = G0b[trial.cre_b[jdet][jex], trial.anh_b[jdet][iex]]
-                cofactor = numpy.zeros((nex_b - 1, nex_b - 1), dtype=numpy.complex128)
+                cofactor = xp.zeros((nex_b - 1, nex_b - 1), dtype=numpy.complex128)
                 for iex in range(nex_b):
                     p = trial.cre_b[jdet][iex]
                     for jex in range(nex_b):
@@ -434,6 +441,7 @@ def build_CI_single_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb
     -------
     None, modifies walker_batch.CIa, and walker_batch.CIb inplace.
     """
+
     if trial.cre_ex_a[1].shape[0] == 0:
         pass
     else:
@@ -587,6 +595,7 @@ def build_CI_double_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb
     -------
     None, modifies walker_batch.CIa, and walker_batch.CIb inplace.
     """
+
     if trial.cre_ex_a[2].shape[0] == 0:
         pass
     else:
@@ -634,6 +643,7 @@ def build_CI_triple_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb
     -------
     None, modifies walker_batch.CIa, and walker_batch.CIb inplace.
     """
+
     if trial.cre_ex_a[3].shape[0] == 0:
         pass
     else:
@@ -1034,13 +1044,15 @@ def build_CI_nfold_excitation_opt(nexcit, walker_batch, trial, c_phasea_ovlpb, c
     -------
     None, modifies walker_batch.CIa, and walker_batch.CIb inplace.
     """
+
     ndets_a = len(trial.cre_ex_a[nexcit])
     nwalkers = walker_batch.G0a.shape[0]
     if ndets_a == 0:
         pass
     else:
+        det_mat = xp.zeros((nwalkers, ndets_a, nexcit, nexcit), dtype=numpy.complex128)
         phases = c_phasea_ovlpb[:, trial.excit_map_a[nexcit]]
-        det_mat = numpy.zeros((nwalkers, ndets_a, nexcit, nexcit), dtype=numpy.complex128)
+
         wk.build_det_matrix(
             trial.cre_ex_a[nexcit],
             trial.anh_ex_a[nexcit],
@@ -1049,7 +1061,10 @@ def build_CI_nfold_excitation_opt(nexcit, walker_batch, trial, c_phasea_ovlpb, c
             walker_batch.Ghalfa,
             det_mat,
         )
-        cof_mat = numpy.zeros((nwalkers, ndets_a, nexcit - 1, nexcit - 1), dtype=numpy.complex128)
+
+        cof_mat = xp.zeros((nwalkers, ndets_a, nexcit - 1, nexcit - 1), dtype=numpy.complex128)
+        phases = c_phasea_ovlpb[:, trial.excit_map_a[nexcit]]
+
         wk.reduce_CI_nfold(
             trial.cre_ex_a[nexcit],
             trial.anh_ex_a[nexcit],
@@ -1060,12 +1075,14 @@ def build_CI_nfold_excitation_opt(nexcit, walker_batch, trial, c_phasea_ovlpb, c
             cof_mat,
             walker_batch.CIa,
         )
+
     ndets_b = len(trial.cre_ex_b[nexcit])
+
     if ndets_b == 0:
         pass
     else:
-        phases = c_phaseb_ovlpa[:, trial.excit_map_b[nexcit]]
-        det_mat = numpy.zeros((nwalkers, ndets_b, nexcit, nexcit), dtype=numpy.complex128)
+
+        det_mat = xp.zeros((nwalkers, ndets_b, nexcit, nexcit), dtype=numpy.complex128)
         wk.build_det_matrix(
             trial.cre_ex_b[nexcit],
             trial.anh_ex_b[nexcit],
@@ -1074,7 +1091,9 @@ def build_CI_nfold_excitation_opt(nexcit, walker_batch, trial, c_phasea_ovlpb, c
             walker_batch.Ghalfb,
             det_mat,
         )
-        cof_mat = numpy.zeros((nwalkers, ndets_b, nexcit - 1, nexcit - 1), dtype=numpy.complex128)
+        cof_mat = xp.zeros((nwalkers, ndets_b, nexcit - 1, nexcit - 1), dtype=numpy.complex128)
+        phases = c_phaseb_ovlpa[:, trial.excit_map_b[nexcit]]
+
         wk.reduce_CI_nfold(
             trial.cre_ex_b[nexcit],
             trial.anh_ex_b[nexcit],
@@ -1110,6 +1129,26 @@ def contract_CI(Q0_act, CI, Ghalf, G):
         G[iw] += numpy.dot(Q0_act[iw], numpy.dot(CI[iw], Ghalf[iw]))
 
 
+def contract_CI_gpu(Q0_act, CI, Ghalf, G):
+    """numba kernel to contract Q, CI and Ghalf to form G
+
+    Parameters
+    ----------
+    Q0_act : numpy.ndarray
+        1-G.
+    CI : numpy.ndarray
+        Intermediate tensor.
+    Ghalf : numpy.ndarray
+        Walker half rotated Green's function
+    G: numpy.ndarray
+        Walker Green's function
+    Returns
+    -------
+    None, modifies G in place
+    """
+    G += xp.einsum("wOa, wae, weo-> wOo", Q0_act, CI, Ghalf)
+
+
 def greens_function_multi_det_wicks_opt(walker_batch, trial, build_full=False):
     """Compute walker's green's function using Wick's theorem.
 
@@ -1124,7 +1163,6 @@ def greens_function_multi_det_wicks_opt(walker_batch, trial, build_full=False):
     det : float64 / complex128
         Determinant of overlap matrix.
     """
-
     nbasis = walker_batch.Ga.shape[-1]
 
     walker_batch.Ga.fill(0.0 + 0.0j)
@@ -1213,4 +1251,131 @@ def greens_function_multi_det_wicks_opt(walker_batch, trial, build_full=False):
     walker_batch.Gb *= (ovlps0 / ovlps)[:, None, None]
     walker_batch.det_ovlpas[:, 0] = signs_a * numpy.exp(logdets_a)
     walker_batch.det_ovlpbs[:, 0] = signs_b * numpy.exp(logdets_b)
+
+    return ovlps
+
+
+def greens_function_multi_det_wicks_opt_gpu(walker_batch, trial, build_full=False):
+    """Compute walker's green's function using Wick's theorem.
+
+    Parameters
+    ----------
+    walker_batch : object
+        MultiDetTrialWalkerBatch object.
+    trial : object
+        Trial wavefunction object.
+    Returns
+    -------
+    det : float64 / complex128
+        Determinant of overlap matrix.
+    """
+
+    nbasis = walker_batch.Ga.shape[-1]
+
+    walker_batch.Ga.fill(0.0 + 0.0j)
+    walker_batch.Gb.fill(0.0 + 0.0j)
+
+    # Build reference Green's functions and overlaps
+    # Note abuse of naming convention this is really theta for the reference
+    # determinant.
+    G0a = xp.zeros((walker_batch.nwalkers, nbasis, nbasis), dtype=numpy.complex128)
+    G0b = xp.zeros((walker_batch.nwalkers, nbasis, nbasis), dtype=numpy.complex128)
+    ovlps0 = xp.zeros((walker_batch.nwalkers), dtype=numpy.complex128)
+    signs_a = xp.zeros_like(ovlps0)
+    signs_b = xp.zeros_like(ovlps0)
+    logdets_a = xp.zeros_like(ovlps0)
+    logdets_b = xp.zeros_like(ovlps0)
+
+    trial_psi0a_conj = xp.zeros_like(trial.psi0a.conj())
+    trial_psi0a_conj.set(trial.psi0a.conj())
+    trial_psi0b_conj = xp.zeros_like(trial.psi0b.conj())
+    trial_psi0b_conj.set(trial.psi0b.conj())
+
+    ovlp = xp.einsum("wex,xE->weE", walker_batch.phia.transpose(0, 2, 1), trial_psi0a_conj)
+    ovlp_inv = xp.linalg.inv(ovlp)
+    walker_batch.Ghalfa = xp.einsum("weE,wEo->weo", ovlp_inv, walker_batch.phia.transpose(0, 2, 1))
+    G0a = xp.einsum("ox,wxO->woO", trial_psi0a_conj, walker_batch.Ghalfa)
+    signs_a, logdets_a = xp.linalg.slogdet(ovlp)
+    signs_b = [1.0 for i in range(walker_batch.nwalkers)]
+    logdets_b = [0.0 for i in range(walker_batch.nwalkers)]
+    ovlp = xp.einsum("wex,xE->weE", walker_batch.phib.transpose(0, 2, 1), trial_psi0b_conj)
+    signs_b, logdets_b = xp.linalg.slogdet(ovlp)
+    ovlp_inv = xp.linalg.inv(ovlp)
+    walker_batch.Ghalfb = xp.einsum("weE,wEo->weo", ovlp_inv, walker_batch.phib.transpose(0, 2, 1))
+    G0b = xp.einsum("ox,wxO->woO", trial_psi0b_conj, walker_batch.Ghalfb)
+
+    trial_psi0a_conj = None
+    trial_psi0b_conj = None
+    ovlp = None
+    ovlp_inv = None
+
+    walker_batch.Ghalfa = xp.ascontiguousarray(walker_batch.Ghalfa)
+    walker_batch.Ghalfb = xp.ascontiguousarray(walker_batch.Ghalfb)
+
+    ovlps0 = signs_a * signs_b * xp.exp(logdets_a + logdets_b)
+    walker_batch.G0a = G0a
+    walker_batch.G0b = G0b
+    walker_batch.Q0a = xp.eye(nbasis)[None, :] - G0a
+    walker_batch.Q0b = xp.eye(nbasis)[None, :] - G0b
+    walker_batch.CIa.fill(0.0 + 0.0j)
+    walker_batch.CIb.fill(0.0 + 0.0j)
+
+    dets_a_full, dets_b_full = compute_determinants_batched(
+        walker_batch.Ghalfa, walker_batch.Ghalfb, trial
+    )
+
+    walker_batch.det_ovlpas = dets_a_full * xp.asarray(trial.phase_a[None, :])  # phase included
+    walker_batch.det_ovlpbs = dets_b_full * xp.asarray(trial.phase_b[None, :])  # phase included
+    ovlpa = walker_batch.det_ovlpas
+    ovlpb = walker_batch.det_ovlpbs
+
+    c_phasea_ovlpb = xp.einsum(
+        "wJ,J->wJ", ovlpb, xp.asarray(trial.phase_a * trial.coeffs.conj()), optimize=True
+    )
+    c_phaseb_ovlpa = xp.einsum(
+        "wJ,J->wJ", ovlpa, xp.asarray(trial.phase_b * trial.coeffs.conj()), optimize=True
+    )
+    # contribution 1 (disconnected diagrams)
+    ovlps = xp.einsum("wJ,J->w", ovlpa * ovlpb, xp.asarray(trial.coeffs.conj()), optimize=True)
+    walker_batch.Ga += xp.einsum("w,wpq->wpq", ovlps, G0a, optimize=True)
+    walker_batch.Gb += xp.einsum("w,wpq->wpq", ovlps, G0b, optimize=True)
+    # intermediates for contribution 2 (connected diagrams)
+
+    if trial.max_excite >= 1:
+        build_CI_single_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb_ovlpa)
+
+    if trial.max_excite >= 2:
+        build_CI_double_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb_ovlpa)
+
+    if trial.max_excite >= 3:
+        build_CI_triple_excitation_opt(walker_batch, trial, c_phasea_ovlpb, c_phaseb_ovlpa)
+
+    for iexcit in range(4, trial.max_excite + 1):
+        build_CI_nfold_excitation_opt(iexcit, walker_batch, trial, c_phasea_ovlpb, c_phaseb_ovlpa)
+
+    # contribution 2 (connected diagrams)
+    # Frozen orbitals not in original active space calculation but reincluded in
+    # AFQMC
+
+    act_orb = trial.act_orb_alpha
+    contract_CI_gpu(
+        walker_batch.Q0a[:, :, act_orb].copy(),
+        walker_batch.CIa,
+        walker_batch.Ghalfa[:, act_orb].copy(),
+        walker_batch.Ga,
+    )
+    act_orb = trial.act_orb_beta
+    contract_CI_gpu(
+        walker_batch.Q0b[:, :, act_orb].copy(),
+        walker_batch.CIb,
+        walker_batch.Ghalfb[:, act_orb].copy(),
+        walker_batch.Gb,
+    )
+    # multiplying everything by reference overlap
+    ovlps *= ovlps0
+    walker_batch.Ga *= (ovlps0 / ovlps)[:, None, None]
+    walker_batch.Gb *= (ovlps0 / ovlps)[:, None, None]
+    walker_batch.det_ovlpas[:, 0] = signs_a * xp.exp(logdets_a)
+    walker_batch.det_ovlpbs[:, 0] = signs_b * xp.exp(logdets_b)
+
     return ovlps