Source code for fcdmft.gw.mol.bse

from functools import reduce
import numpy
import scipy
import scipy.linalg as sla
import time

from pyscf import lib
from pyscf.data import nist
HARTREE2EV = nist.HARTREE2EV

einsum = lib.einsum



[docs]
def bse_full_diagonalization(bse, multi):
    """Full diagonalization of BSE equation.
    BSE equation is defined as equation 1 in doi.org/10.1002/jcc.24688.
    Spin-adapted formalism can be found in chapter 18.3.2 in "Concepts and methods in modern theoretical chemistry.
    Electronic structure (2013, CRC) Ghosh S.K., Chattaraj P.K. (eds.)"
    The working equation is rewritten as equation 15 in doi.org/10.1063/1.477483.

    Args:
        bse (fcdmft.gw.mol.bse.BSE): BSE object.
        multi (str): multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.

    Returns:
        exci (double array): excitation energy.
        X_vec (double ndarray): X block of eigenvector (excitation).
        Y_vec (double ndarray): Y block of eigenvector (de-excitation).
    """
    # load parameters
    nocc = bse.nocc
    mo_energy = bse.mo_energy
    Lpq = bse.Lpq
    TDA = bse.TDA
    nspin, _, nmo, _ = Lpq.shape

    # determine dimension
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
    apb = numpy.zeros(shape=[full_dim, full_dim], dtype=numpy.double)
    amb = numpy.zeros_like(apb)

    Lpq_bar = _get_lpq_bar(nocc=nocc, mo_energy=mo_energy, Lpq=Lpq)

    # scale Coulomb matrix
    scale = 4.0 / nspin
    if TDA is True:
        scale /= 2.0

    # Coulomb part
    if multi == "s" or multi == "u":
        for i in range(nspin):
            for j in range(nspin):
                apb[i * dim[0]: i * dim[0] + dim[i], j * dim[0]: j * dim[0] + dim[j]] += \
                    einsum('Lia,Ljb->iajb', Lpq[i][:, : nocc[i], nocc[i]:], Lpq[j][:, : nocc[j], nocc[j]:])\
                    .reshape(dim[i], dim[j]) * scale

    # W part
    for i in range(nspin):
        WA = -einsum('Lij,Lab->iajb', Lpq[i][:, : nocc[i], : nocc[i]], Lpq_bar[i][:, nocc[i]:, nocc[i]:])
        WB = numpy.zeros_like(WA)
        if TDA is False:
            WB = -einsum('Lib,Laj->iajb', Lpq[i][:, : nocc[i], nocc[i]:], Lpq_bar[i][:, nocc[i]:, :nocc[i]])
        WA = WA.reshape(nocc[i]*nvir[i], nocc[i]*nvir[i])
        WB = WB.reshape(nocc[i]*nvir[i], nocc[i]*nvir[i])
        apb[i * dim[0]: i * dim[0] + dim[i], i * dim[0]: i * dim[0] + dim[i]] += WA + WB
        amb[i * dim[0]: i * dim[0] + dim[i], i * dim[0]: i * dim[0] + dim[i]] += WA - WB

    # orbital energy contribution to A+B and A-B matrix
    orb_diff = []
    for i in range(nspin):
        orb_diff.append(numpy.asarray(mo_energy[i][None, nocc[i]:] - mo_energy[i][:nocc[i], None]).reshape(-1))
    orb_diff = numpy.concatenate(orb_diff, axis=0)
    apb += numpy.diag(orb_diff)
    amb += numpy.diag(orb_diff)

    # equation 15 in doi/10.1063/1.477483, solved by LAPACK function dspgv
    exci, xpy = scipy.linalg.eigh(apb, amb, type=3)
    exci = numpy.sqrt(exci)
    xpy = xpy.T
    # (A-B)^{-1/2} |X+Y> to |X+Y>
    for i in range(full_dim):
        xpy[i, :] /= numpy.sqrt(exci[i])

    xmy = numpy.dot(apb, xpy.T).T
    for i in range(full_dim):
        xmy[i, :] /= exci[i]

    X_vec = (xpy + xmy) / 2.0
    Y_vec = (xpy - xmy) / 2.0

    for i in range(10):
        my_exci0 = reduce(numpy.dot, (xmy[i].T, amb, apb, xpy[i]))
        my_exci0 = numpy.sqrt(my_exci0)
        my_exci1 = reduce(numpy.dot, (xpy[i].T, apb, xpy[i]))
        my_exci2 = reduce(numpy.dot, (xmy[i].T, amb, xmy[i]))
        print("exci = %.6f my_exci0 = %.6f my_exci1 = %.6f my_exci2 = %.6f" % (exci[i], my_exci0, my_exci1, my_exci2))

    bse.exci = exci
    bse.X_vec = X_vec
    bse.Y_vec = Y_vec

    return exci, X_vec, Y_vec




[docs]
def get_excitation_from_eigenvector(multi, nocc, mo_energy, Lpq, X_vec, Y_vec, TDA=False, Lpq_bar=None):
    """Get excitation energies from corresponding eigenvectors.
    exci = (X+Y)^T (A+B) (X+Y)
    Equation.18 in doi.org/10.1063/1.477483

    Args:
        multi (str): multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        Lpq (double ndarray): 3-center density-fitting matrix.
        X_vec (list of double ndarray): X block of eigenvector (excitation).
        Y_vec (list of double ndarray): Y block of eigenvector (de-excitation).
        TDA (bool, optional): use TDA approximation. Defaults to False.
        Lpq_bar (double ndarray, optional): auxiliary 3-center matrix as equation 21 in doi.org/10.1002/jcc.24688. Defaults to None.

    Returns:
        exci (double array): excitation energies.
    """
    nspin, naux, nmo, _ = Lpq.shape

    # determine dimension
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]
    if Lpq_bar is None:
        Lpq_bar = _get_lpq_bar(nocc=nocc, mo_energy=mo_energy, Lpq=Lpq)

    nroot = X_vec[0].shape[0]
    if nspin == 1:
        xpy = (X_vec[0] + Y_vec[0]).reshape(nroot, -1)
    else:
        X_vec_a, X_vec_b = X_vec[0].reshape(nroot, -1), X_vec[1].reshape(nroot, -1)
        Y_vec_a, Y_vec_b = Y_vec[0].reshape(nroot, -1), Y_vec[1].reshape(nroot, -1)
        X_vec_ab = numpy.concatenate((X_vec_a, X_vec_b), axis=1)
        Y_vec_ab = numpy.concatenate((Y_vec_a, Y_vec_b), axis=1)
        xpy = X_vec_ab + Y_vec_ab

    # scale Coulomb matrix
    scale = 4.0 / nspin
    if TDA is True:
        scale /= 2.0

    apb_prod = numpy.zeros(shape=[nroot, full_dim], dtype=numpy.double)

    # contraction: V
    if multi != 't' and multi != 'T':
        for ivec in range(nroot):
            Lpq_z = numpy.zeros(shape=[nspin, naux], dtype=numpy.double)
            for s in range(nspin):
                z = xpy[ivec][s * dim[0]: s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])
                Lpq_z[s] = einsum('Pjb,jb->P', Lpq[s][:, :nocc[s], nocc[s]:], z)

            for s in range(nspin):
                for t in range(nspin):
                    vz = einsum('Pia,P->ia', Lpq[s][:, :nocc[s], nocc[s]:], Lpq_z[t]).reshape(-1) * scale
                    apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += vz

    # contraction: W
    for ivec in range(nroot):
        # this matrix can be large
        Lpq_z = numpy.zeros(shape=[nspin, naux, nmo, nmo], dtype=numpy.double)
        for s in range(nspin):
            z = xpy[ivec][s * dim[0]: s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])
            Lpq_z[s][:, :, :nocc[s]] = einsum('Lpb,jb->Lpj', Lpq[s][:, :, nocc[s]:], z)
            waz = -einsum('Lji,Laj->ia', Lpq_bar[s][:, :nocc[s], :nocc[s]], Lpq_z[s][:, nocc[s]:, :nocc[s]]).reshape(-1)
            if TDA is True:
                wbz = numpy.zeros_like(waz)
            else:
                wbz = -einsum('Laj,Lij->ia', Lpq_bar[s][:, nocc[s]:, :nocc[s]],
                              Lpq_z[s][:, :nocc[s], :nocc[s]]).reshape(-1)
            apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += waz + wbz

    # contraction: orbital energy difference
    for ivec in range(nroot):
        for s in range(nspin):
            orb_diff = numpy.asarray(mo_energy[s][None, nocc[s]:] - mo_energy[s][:nocc[s], None]).reshape(-1)
            oz = orb_diff * xpy[ivec][s * dim[0]: s * dim[0] + dim[s]]
            apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += oz

    # As shown in doi.org/10.1063/1.477483
    # equation 18 (A+B)|X+Y>=w|X-Y> and orthogonality in equation 20 <X+Y|X-Y> = delta
    # so <X+Y|(A+B) = <X+Y|w|X-Y> = w
    exci = numpy.zeros(shape=[nroot], dtype=numpy.double)
    for i in range(nroot):
        exci[i] = numpy.matmul(xpy[i].T, apb_prod[i])

    return exci




[docs]
def bse_davidson(bse, multi, e_min=0.0, delta=0.0):
    """Davidson algorithm for BSE.
    The Davidson algorithm follows doi.org/10.1063/1.477483.
    BSE equation is defined as equation 1 in doi.org/10.1002/jcc.24688.
    Spin-adapted formalism can be found in chapter 18.3.2 in "Concepts and methods in modern theoretical chemistry.
    Electronic structure (2013, CRC) Ghosh S.K., Chattaraj P.K. (eds.)"

    Args:
        bse (fcdmft.gw.mol.bse.BSE): BSE object.
        multi (str): multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
        e_min (float, optional): minimum desired excitation energy. Defaults to 0.0.
        delta (float, optional): energy shift for trial vector generation, typically <=0.0.

    Returns:
        exci (double array): excitation energy.
        X_vec (double ndarray): X block of eigenvector (excitation).
        Y_vec (double ndarray): Y block of eigenvector (de-excitation).
    """
    # load matrix
    nspin = bse.nspin
    nmo = bse.nmo
    nocc = bse.nocc
    mo_energy = bse.mo_energy
    Lpq = bse.Lpq
    # load parameter
    TDA = bse.TDA
    max_vec = bse.max_vec
    nroot = bse.nroot
    max_iter = bse.max_iter
    max_expand = bse.max_expand
    init_ntri = max(nroot, bse.init_ntri)
    residue_thresh = bse.residue_thresh

    # determine dimension
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]

    # initialize trial vector
    tri_vec = numpy.zeros(shape=[max_vec, full_dim], dtype=numpy.double)
    ntri = min(init_ntri, full_dim) # initial guess size should be larger than nroot
    ntri_found, tri_vec_found = get_davidson_trial_vector(ntri=ntri, nocc=nocc, mo_energy=mo_energy, e_min=e_min, delta=delta)
    if ntri_found < ntri:
        lib.logger.info(bse, f"only {ntri_found} trial vectors are generated rather than {ntri}.")
        ntri = ntri_found
    if ntri_found < nroot:
        raise ValueError("cannot find enough trial vectors; lower e_min or add more trial vectors")
    tri_vec[:ntri, :] = tri_vec_found
    del tri_vec_found

    # initialize Davidson matrix
    apb_prod = numpy.zeros_like(tri_vec)
    amb_prod = numpy.zeros_like(tri_vec)
    left_vec = numpy.zeros(shape=[nroot, full_dim], dtype=numpy.double)
    right_vec = numpy.zeros_like(left_vec)
    exci = numpy.zeros(shape=[nroot], dtype=numpy.double)

    Lia = [numpy.ascontiguousarray(Lpq[s][:, :nocc[s], nocc[s]:]) for s in range(nspin)]
    Laa = [numpy.ascontiguousarray(Lpq[s][:, nocc[s]:, nocc[s]:]) for s in range(nspin)]
    Lii_bar, Lia_bar = _get_lpq_bar_by_block(
        nocc=nocc,
        mo_energy=mo_energy,
        Lii=[Lpq[s][:, :nocc[s], :nocc[s]] for s in range(nspin)],
        Lia=Lia
    )

    iter = 0
    nprod = 0 # the number of contracted vectors
    total_contract_work = 0
    total_linalg_work = 0
    while iter < max_iter:
        lib.logger.info(bse, "\nBSE Davidson #%d iteration, ntri= %d , nprod= %d .", iter+1, ntri, nprod)
        apb_prod[nprod:ntri, :], amb_prod[nprod:ntri, :], contract_work_this_iter = \
            _bse_contraction(multi=multi, nocc=nocc, mo_energy=mo_energy,
                             Lia=Lia, Laa=Laa, Lii_bar=Lii_bar, Lia_bar=Lia_bar,
                             tri_vec=tri_vec[nprod:ntri, :], TDA=TDA)
        nprod = ntri
        total_contract_work += contract_work_this_iter
        lib.logger.info(bse, f"work for iter {iter+1}: {float(contract_work_this_iter):.2E}")

        # A+B and A-B in subspace, step 3 in doi.org/10.1063/1.477483
        Mp = numpy.matmul(tri_vec[:ntri, :], apb_prod[:ntri, :].T)
        Mm = numpy.matmul(tri_vec[:ntri, :], amb_prod[:ntri, :].T)
        Mm = (Mm + Mm.T) / 2.0

        total_linalg_work += 2 * ntri * ntri * tri_vec.shape[1]

        # check stability
        Mm_e, Mm_v = scipy.linalg.eigh(Mm)
        assert Mm_e[0] >= 0.0, "negative eigenvalues in the subspace diagonalization"
    
        total_linalg_work += Mm.shape[0] ** 3

        # equation 24 in doi.org/10.1063/1.477483
        Mm_half = Mm_v @ numpy.diag(Mm_e**0.5) @ Mm_v.T
        Mh = reduce(numpy.matmul, (Mm_half.T, Mp, Mm_half))
        Mh = (Mh + Mh.T) / 2.0 # symmetrize the Hermitian matrix
    
        total_linalg_work += Mm_v.shape[0] ** 2 * Mm_v.shape[1] \
                            + Mm_half.shape[0] ** 2 * Mm_half.shape[1]

        # diagonalization in the subspace
        e_tri, v_tri = scipy.linalg.eigh(Mh)
        found_roots = numpy.flatnonzero(e_tri >= e_min)
        nrootfound = min(nroot, found_roots.size)
        lib.logger.debug(bse, "lowest %d subspace diagonalization eigenvalue\n%s", nrootfound, e_tri[found_roots[:nrootfound]])
        assert e_tri[0] >= 0.0
        e_tri = numpy.sqrt(e_tri)
        emin_index = numpy.searchsorted(e_tri, e_min, side='left')
        if emin_index + nroot > e_tri.size:
            lib.logger.info(bse, "cannot find enough approx eigenvectors; lower e_min or add more trial vectors")
            emin_index = e_tri.size - nroot
        exci[:] = e_tri[emin_index:emin_index+nroot]
        total_linalg_work += Mh.shape[0] ** 3

        # get left and right trial vectors in the subspace, equation 18 and 19 in doi.org/10.1063/1.477483
        Mm_inv_half = Mm_v @ numpy.diag(Mm_e**-0.5) @ Mm_v.T
    
        total_linalg_work += Mm_v.shape[0] ** 2 * Mm_v.shape[1]

        left_vec_tri = v_tri[:, emin_index:emin_index+nroot].T @ Mm_inv_half.T
        for i in range(nroot):
            left_vec_tri[i] *= exci[i]
            total_linalg_work += left_vec_tri[i].size
        right_vec_tri = v_tri[:, emin_index:emin_index+nroot].T @ Mm_half.T
        total_linalg_work += v_tri.shape[0] * nroot * Mm_half.shape[1]

        # orthonormalize as equation 20 in orthonormalize
        orth_work = _bi_orth_norm(left_vec=left_vec_tri, right_vec=right_vec_tri)
        total_linalg_work += orth_work

        # get left and right eigenvector in the full space, equation 25 and 26 in doi.org/10.1063/1.477483
        left_vec[:nroot, :] = numpy.matmul(left_vec_tri[:nroot, :], tri_vec[:ntri, :])
        right_vec[:nroot, :] = numpy.matmul(right_vec_tri[:nroot, :],tri_vec[:ntri, :])
    
        total_linalg_work += 2 * nroot * ntri * tri_vec.shape[1]

        # expand trial vector space if residues are large
        ntri_old = ntri
        conv, ntri = _expand_trial_vector(bse=bse, nocc=nocc, mo_energy=mo_energy, exci=exci, tri_vec=tri_vec,
                                          left_vec=left_vec, right_vec=right_vec, left_vec_tri=left_vec_tri,
                                          right_vec_tri=right_vec_tri, apb_prod=apb_prod, amb_prod=amb_prod,
                                          thresh=residue_thresh, e_min=e_min,
                                          max_expand=max_expand)
        lib.logger.info(bse, "add %d new trial vectors.", ntri - ntri_old)

        iter += 1
        if conv is True:
            break

    assert conv is True, "Davidson algorithm for BSE is not converged!"

    lib.logger.info(bse, "BSE converged in %d iterations, final subspace size = %d", iter, nprod)
    lib.logger.info(bse, f"total work for contraction: {float(total_contract_work):.2E}")
    lib.logger.info(bse, f"total work for linalg: {float(total_linalg_work):.2E}")

    # transfer left and right eigenvector to X and Y
    X_vec = (left_vec + right_vec) * 0.5
    Y_vec = (-left_vec + right_vec) * 0.5

    # reshape X and Y eigenvector
    if nspin == 1:
        X_vec = [X_vec.reshape(nroot, nocc[0], nvir[0])]
        Y_vec = [Y_vec.reshape(nroot, nocc[0], nvir[0])]
    else:
        X_vec_a, X_vec_b, Y_vec_a, Y_vec_b = [], [], [], []
        for r in range(nroot):
            X_vec_a.append(X_vec[r][:dim[0]].reshape(nocc[0], nvir[0]))
            X_vec_b.append(X_vec[r][dim[0]:].reshape(nocc[1], nvir[1]))
            Y_vec_a.append(Y_vec[r][:dim[0]].reshape(nocc[0], nvir[0]))
            Y_vec_b.append(Y_vec[r][dim[0]:].reshape(nocc[1], nvir[1]))
        X_vec = [numpy.asarray(X_vec_a), numpy.asarray(X_vec_b)]
        Y_vec = [numpy.asarray(Y_vec_a), numpy.asarray(Y_vec_b)]

    bse.exci = exci
    bse.X_vec = X_vec
    bse.Y_vec = Y_vec

    return exci, X_vec, Y_vec




[docs]
def bse_lanczos(bse, multi, u1=None, nsteps=100):
    """Lanczos algorithm for BSE.
    Follows 10.1137/16M1102641.

    Args:
        bse (fcdmft.gw.mol.bse.BSE): BSE object.
        multi (str): multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
        u1 (double ndarray, optional): initial state for Lanczos algorithm. Defaults to None.
        nsteps (int, optional): the number of Lanczos steps. Defaults to 100.

    Returns:
        exci (double array): excitation energy.
        X_vec (double ndarray): X block of eigenvector (excitation).
        Y_vec (double ndarray): Y block of eigenvector (de-excitation).
    """
    # load matrix
    nspin = bse.nspin
    nmo = bse.nmo
    nocc = bse.nocc
    mo_energy = bse.mo_energy
    Lpq = bse.Lpq
    # load parameter
    TDA = bse.TDA

    # determine dimension
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]


    Lia = [numpy.ascontiguousarray(Lpq[s][:, :nocc[s], nocc[s]:]) for s in range(nspin)]
    Laa = [numpy.ascontiguousarray(Lpq[s][:, nocc[s]:, nocc[s]:]) for s in range(nspin)]
    Lii_bar, Lia_bar = _get_lpq_bar_by_block(
        nocc=nocc,
        mo_energy=mo_energy,
        Lii=[Lpq[s][:, :nocc[s], :nocc[s]] for s in range(nspin)],
        Lia=Lia
    )

    if u1 is None:
        eia = []
        for s in range(nspin):
            eia.append(numpy.asarray(mo_energy[s][None, nocc[s]:] - mo_energy[s][:nocc[s], None]).reshape(-1))
        eia = numpy.concatenate(eia, axis=0)
        u1 = numpy.random.random(full_dim) - 0.5
        u1 = u1 / numpy.linalg.norm(u1)
    apb_u1, _ = _bse_contraction(multi=multi, nocc=nocc, mo_energy=mo_energy, Lia=Lia, Laa=Laa, Lii_bar=Lii_bar, Lia_bar=Lia_bar, tri_vec=u1[None, :], TDA=TDA, return_amb=False)

    apb_u1 = apb_u1.reshape(-1)

    betas = numpy.zeros(nsteps)
    alphas = numpy.zeros(nsteps)

    if TDA is False:
        u1_apbnorm = numpy.dot(u1, apb_u1)
        u = u1 / numpy.sqrt(u1_apbnorm)
        v = apb_u1 / numpy.sqrt(u1_apbnorm)
    else:
        u = u1 / numpy.linalg.norm(u1)
        v = u

    u_last = numpy.zeros_like(u)
    v_last = numpy.zeros_like(v)
    beta_last = 0.0

    for step in range(nsteps):
        lib.logger.info(bse, 'BSE Lanczos #%d iteration', step + 1)
        if TDA is False:
            # x = (A - B) v_j - beta_{j-1} u_{j-1}
            amb_v, _ = _bse_contraction(
                multi=multi,
                nocc=nocc,
                mo_energy=mo_energy,
                Lia=Lia,
                Laa=Laa,
                Lii_bar=Lii_bar,
                Lia_bar=Lia_bar,
                tri_vec=v.reshape((1, -1)),
                TDA=TDA,
                return_apb=False,
            )
            amb_v = amb_v.reshape(-1)
            sla.blas.daxpy(u_last, amb_v, a=-beta_last)
            x = amb_v
            # alpha = v_j^T x
            alphas[step] = numpy.dot(x, v)
            # x = x - alpha u_j
            sla.blas.daxpy(u, x, a=-alphas[step])
            # y = (A + B) x
            y, _ = _bse_contraction(
                multi=multi,
                nocc=nocc,
                mo_energy=mo_energy,
                Lia=Lia,
                Laa=Laa,
                Lii_bar=Lii_bar,
                Lia_bar=Lia_bar,
                tri_vec=x.reshape((1, -1)),
                TDA=TDA,
                return_amb=False,
            )
            y = y.reshape(-1)
            # beta_j = sqrt(x^T y)
            betas[step] = numpy.sqrt(numpy.dot(x, y))
            u_last = u
            v_last = v
            # u_{j+1} = x / beta_j
            # v_{j+1} = y / beta_j
            sla.blas.dscal(1.0 / betas[step], x)
            sla.blas.dscal(1.0 / betas[step], y)
            u = x
            v = y
        else:
            # TDA approximation
            # v = A u_j - beta_{j-1} u_{j-1}
            v, _ = _bse_contraction(
                multi=multi,
                nocc=nocc,
                mo_energy=mo_energy,
                Lia=Lia,
                Laa=Laa,
                Lii_bar=Lii_bar,
                Lia_bar=Lia_bar,
                tri_vec=u.reshape((1, -1)),
                TDA=TDA,
                return_apb=False,
            )
            v = v.reshape(-1)
            sla.blas.daxpy(u_last, v, a=-beta_last)
            # alpha_j = u_j^T v
            alphas[step] = numpy.dot(u, v)
            # v = v - alpha u_j
            sla.blas.daxpy(u, v, a=-alphas[step])
            # beta_j = ||v||
            betas[step] = numpy.linalg.norm(v)
            # u_{j+1} = v / beta_j
            sla.blas.dscal(1.0 / betas[step], v)
            u_last = u
            u = v
        beta_last = betas[step]
    return alphas, betas



[docs]
def lanczos_estimate_spectrum(alphas, betas, e_range, eta, nw, TDA=False):
    """Estimate the excitation spectrum density from the results of the Lanczos algorithm.

    Parameters
    ----------
    alphas (double array): coefficients from the Lanczos algorithm.
        diagonal elements of the tridiagonal matrix.
    betas (double array): coefficients from the Lanczos algorithm.
        off-diagonal elements of the tridiagonal matrix.
    e_range (tuple): energy range (e_min, e_max).
    eta (float): broadening factor.
    nw (int): number of frequency points.
    TDA (bool, optional): Lanczos used TDA approximation. Defaults to False.

    Returns
    -------
    freqs (double array): frequency points at which to compute density estimate.
    density (double array): excitation spectrum density estimate.
    """
    Tk_diag = numpy.concatenate([alphas, alphas[-2::-1]], axis=0)
    Tk_offdiag = numpy.concatenate([betas, betas[-3::-1]], axis=0)
    roots, S = scipy.linalg.eigh_tridiagonal(Tk_diag, Tk_offdiag, lapack_driver='stebz')
    roots_pos = roots[roots > 0]
    if TDA is False:
        roots_pos = numpy.sqrt(roots_pos)
    magnitudes = S[0, roots > 0]**2
    freqs = numpy.linspace(e_range[0], e_range[1], nw)
    def gauss_broad(omega, eta, roots):
        normalization = 1.0 / numpy.sqrt(2 * numpy.pi * eta**2)
        return normalization * (numpy.exp(-(omega[:, None] - roots[None, :])**2 / (2 * eta**2)) \
                                - numpy.exp(-(omega[:, None] + roots[None, :])**2 / (2 * eta**2)))
    if TDA is False:
        density = (gauss_broad(freqs, eta, roots_pos) / roots_pos[None, :]) @ magnitudes
    else:
        density = gauss_broad(freqs, eta, roots_pos) @ magnitudes
    return freqs, density





[docs]
def get_davidson_trial_vector(ntri, nocc, mo_energy, e_min=0.0, delta=0.0):
    """Generate initial trial vectors for particle-hole excitations.
    The order is determined by the occ-vir pair orbital energy difference.
    The initial trial vectors are diagonal. They are generated by taking
    occ-vir pairs with an energy difference of >= e_min + delta.

    Args:
        ntri (int): the number of desired initial trial vectors.
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        e_min (float, optional): minimum desired excitation energy. Defaults to 0.0.
        delta (float, optional): energy shift for trial vector generation, typically <=0.0.

    Returns:
        ntri, int: the number of actual trial vectors generated
        tri_vec, double ndarray: initial trial vectors
    """
    nspin, nmo = mo_energy.shape
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]

    e_diffs = []

    for s in range(nspin):
        # The shape of e_diffs_s is (nocc[s], nvir[s])
        # e_diffs_s[i, a] = mo_energy[s][a] - mo_energy[s][i]
        e_diffs_s = mo_energy[s][None, nocc[s]:] - mo_energy[s][:nocc[s], None]
        # Flatten e_diffs[s] into a 1D array.
        e_diffs_s = e_diffs_s.reshape(-1)
        e_diffs.append(e_diffs_s)

    # At this point, the structure of e_diffs is as follows:
    # e_diffs[spin, ia] = mo_energy[spin][a] - mo_energy[spin][i]
    # where ia = a + nvir[spin] * i

    # Glue the e_diffs together into a 1D array.
    all_ediffs = numpy.concatenate(e_diffs, axis=0)

    # Compute the sizes of the occ-vir blocks for each spin.
    e_diffs_sizes = [0] + [nocc[s] * nvir[s] for s in range(nspin)]
    # Compute the starting index of each spin's occ-vir block.
    # This indicates where e_diffs[s] resides in all_ediffs, for each s.
    e_diffs_starts = numpy.cumsum(e_diffs_sizes)

    # Find the indices which sort all_ediffs.
    sort_index = numpy.argsort(all_ediffs)

    # Take the lowest ntri pairs with energy difference greater than e_min + delta.
    e_min_index = numpy.searchsorted(all_ediffs, e_min + delta, side='left', sorter=sort_index)
    if e_min_index + ntri > all_ediffs.size:
        # cannot find enough pairs for trial vectors; lower e_min
        ntri = all_ediffs.size - e_min_index
    exci_to_take = sort_index[e_min_index:e_min_index+ntri]

    # exci_to_take is an index into all_ediffs.
    # We need to convert it back to orbital indices.

    tri_vec = numpy.zeros(shape=[ntri, full_dim], dtype=numpy.double)

    cur_trivec = 0
    for s in range(nspin):
        # Figure out which excitation indices are in this spin block.
        exci_this_spin = numpy.extract(
            (exci_to_take >= e_diffs_starts[s]) & (exci_to_take < e_diffs_starts[s+1]),
            exci_to_take
        )
        # Subtract the starting index of this spin's occ-vir block.
        # They are now in the form ia = i * nvir[s] + a.
        # That is, they are indices into e_diffs[s].reshape(-1).
        exci_this_spin -= e_diffs_starts[s]
        # Convert the indices from 1D form (i * nvir[s] + a) to 2D form (i, a).
        ex_occ, ex_vir = numpy.unravel_index(exci_this_spin, (nocc[s], nvir[s]))
        n_exci = exci_this_spin.size

        # The following is shorthand for
        # for i, a in zip(ex_occ, ex_vir):
        #     tri_vec[cur_trivec, s * dim[s] + i * nvir[s] + a] = 1.
        #     cur_trivec += 1
        tri_vec[
            range(cur_trivec, cur_trivec + n_exci),
            s * dim[s] + ex_occ * nvir[s] + ex_vir
        ] = 1.
        cur_trivec += n_exci

    return ntri, tri_vec



def _bse_contraction(multi, nocc, mo_energy, Lia, Laa,
                     Lii_bar, Lia_bar, tri_vec, TDA=False,
                     return_apb=True, return_amb=True):
    """Contraction for BSE matrix and trial vectors.
    W part is as equation 25 and 26 in doi.org/10.1002/jcc.24688.

    Args:
        multi (str): multiplicity, 's'=singlet, 't'=triplet, 'u'=unrestricted.
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        Lia (double ndarray): 3-center density-fitting matrix, ov block.
        Laa (double ndarray): 3-center density-fitting matrix, vv block.
        Lii_bar (double ndarray): auxiliary 3-center matrix as equation 21 in doi.org/10.1002/jcc.24688.
        Lia_bar (double ndarray): auxiliary 3-center matrix as equation 21 in doi.org/10.1002/jcc.24688.
        tri_vec (double ndarray): trial vector.
        TDA (bool, optional): use TDA approximation. Defaults to False.
        return_apb (bool, optional): return A+B matrix and trial vector contracted vectors. Defaults to True.
        return_amb (bool, optional): return A-B matrix and trial vector contracted vectors. Defaults to True.

    Returns:
        apb_prod, double ndarray: A+B matrix and trial vector contracted vectors.
        amb_prod, double ndarray: A-B matrix and trial vector contracted vectors.
    """
    nspin = len(Lia)
    naux, _, _ = Lia[0].shape
    nmo = Lia[0].shape[2] + Lia[0].shape[1]
    ntri = tri_vec.shape[0]

    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]

    work_done = 0

    scale = 4.0 / nspin
    if TDA is True:
        scale /= 2.0

    if return_apb:
        apb_prod = numpy.zeros(shape=[ntri, full_dim], dtype=numpy.double)
    if return_amb:
        amb_prod = numpy.zeros(shape=[ntri, full_dim], dtype=numpy.double)

    # contraction: V
    if multi != 't' and multi != 'T':
        for ivec in range(ntri):
            Lpq_z = numpy.empty(shape=[nspin, naux], dtype=numpy.double)
            for s in range(nspin):
                z = tri_vec[ivec][s * dim[0]: s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])

                # The following code is exactly equivalent to
                # Lpq_z[s] = einsum('Pjb,jb->P', Lia[s], z)
                Lpq_z[s] = Lia[s].reshape(naux, -1) @ z.reshape(-1)
                work_done += naux * nvir[s] * nocc[s]

            for s in range(nspin):
                for t in range(nspin):
                    # vz = einsum('Pia,P->ia', Lia[s], Lpq_z[t]).reshape(-1) * scale
                    vz = (Lia[s].reshape(naux, -1).T @ Lpq_z[t]).reshape(-1) * scale
                    work_done += naux * nvir[s] * nocc[s]
                    if return_apb:
                        apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += vz
                    if TDA is True and return_amb:
                        amb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += vz

    # contraction: W
    for ivec in range(ntri):
        #continue
        # this matrix can be large
        #Lpq_z = numpy.zeros(shape=[nspin, naux, nmo, nmo], dtype=numpy.double)
        for s in range(nspin):
            z = tri_vec[ivec][s * dim[0]: s * dim[0] + dim[s]].reshape(nocc[s], nvir[s])

            # The following calculation for waz is equivalent to
            # Laj_zs = einsum('Lab,jb->Laj', Laa[s], z)
            # waz = -einsum('Lji,Laj->ia', Lii_bar[s], Laj_zs).reshape(-1)

            jLa_zs = numpy.matmul(z, Laa[s].reshape(-1, nvir[s]).T)
            jLa_zs = jLa_zs.reshape(nocc[s], naux, nvir[s])
            waz = numpy.zeros((nocc[s], nvir[s]), dtype=numpy.double)
            for j in range(nocc[s]):
                # jLa, Lji -> ia for a single j
                sla.blas.dgemm(
                    alpha=-1.0,
                    a=jLa_zs[j].T, # aL
                    b=Lii_bar[s][:, j, :].T, # iL (Li)
                    trans_a=0,
                    trans_b=1,
                    beta=1.0,
                    c=waz.T,
                    overwrite_c=True
                )
            waz = waz.reshape(-1)
            work_done += naux * nocc[s] * nocc[s] * nvir[s]
            if TDA is True:
                wbz = numpy.zeros_like(waz)
            else:
                # the following calculation for wbz is equivalent to
                # Lij_zs = einsum('Lib,jb->Lij', Lia[s], z)
                # wbz = -einsum('Lja,Lij->ia', Lia_bar[s], Lij_zs).reshape(-1)
                wbz = numpy.zeros((nocc[s], nvir[s]), dtype=numpy.double)
                jLi_zs = numpy.matmul(z, Lia[s].reshape(-1, nvir[s]).T)
                jLi_zs = jLi_zs.reshape(nocc[s], naux, nocc[s])
                for j in range(nocc[s]):
                    # jLi, Lja -> ia for a single j
                    sla.blas.dgemm(
                        alpha=-1.0,
                        b=jLi_zs[j].T, # iL
                        a=Lia_bar[s][:, j, :].T, # aL (La)
                        trans_a=0,
                        trans_b=1,
                        beta=1.0,
                        c=wbz.T,
                        overwrite_c=True
                    )
                wbz = wbz.reshape(-1)
                work_done += naux * nocc[s] * nocc[s] * nvir[s]
            if return_apb:
                apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += waz + wbz
            if return_amb:
                amb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += waz - wbz

    # contraction: orbital energy difference
    for s in range(nspin):
        orb_diff = numpy.asarray(mo_energy[s][None, nocc[s]:] - mo_energy[s][:nocc[s], None]).reshape(-1)
        for ivec in range(ntri):
            oz = orb_diff * tri_vec[ivec][s * dim[0]: s * dim[0] + dim[s]]
            if return_apb:
                apb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += oz
            if return_amb:
                amb_prod[ivec][s * dim[0]: s * dim[0] + dim[s]] += oz
            work_done += 2 * oz.size

    ret_args = []
    if return_apb:
        ret_args.append(apb_prod)
    if return_amb:
        ret_args.append(amb_prod)
    ret_args.append(work_done)
    return tuple(ret_args)


def _bi_orth_norm(left_vec, right_vec):
    """Orthonormalize left and right eigenvectors.
    Orthonormalization condition is defined as equation 20 in doi.org/10.1063/1.477483.

    Args:
        left_vec (double ndarray): left eigenvectors.
        right_vec (double ndarray): right eigenvectors.
    """
    work = 0
    nroot = left_vec.shape[0]
    # orthogonalization

    # the L1 BLAS function xSCAL just scales a vector
    scal = sla.blas.get_blas_funcs('scal', (left_vec[0],))

    # the L2 BLAS function xGER performs a rank-1 update of a matrix.
    ger = sla.get_blas_funcs('ger', (left_vec[0],))

    for i in range(nroot):
        # Remove the projection of the left vector i on the right vectors != i
        # qii is ||left_vec[i]||^2
        qii = numpy.linalg.norm(left_vec[i])**2
        if i > 0:
            # Remove the projection of the left vector i on the right vectors < i
            rijs = (right_vec[:i] @ left_vec[i]) / qii
            # The following line does
            # right_vec[j] -= rijs[j] * left_vec[i] for all j < i
            ger(-1.0, left_vec[i], rijs, a=right_vec[:i].T, overwrite_x=False, overwrite_a=True)
        if i < nroot - 1:
            # Remove the projection of the left vector i on the right vectors > i
            rijs = (right_vec[i+1:nroot] @ left_vec[i]) / qii
            # The following line does
            # right_vec[j] -= rijs[j] * left_vec[i] for all j > i
            ger(-1.0, left_vec[i], rijs, a=right_vec[i+1:nroot].T, overwrite_x=False, overwrite_a=True)
        work += left_vec[i].size ** 2 * (nroot - 1)

    for i in range(nroot):
        # At this stage, the vectors are biorthogonal, but
        # not necessarily biorthonormal. We normalize them.
        aii = numpy.dot(left_vec[i], right_vec[i])
        work += left_vec[i].size
        assert numpy.abs(aii) > 1.0e-12, "fails to normalize vectors"
        sqrt_aii = numpy.sqrt(aii)
        scal(1.0 / sqrt_aii, right_vec[i])
        scal(1.0 / sqrt_aii, left_vec[i])
    return work


def _expand_trial_vector(
        bse, nocc, mo_energy, exci, tri_vec, left_vec, right_vec, left_vec_tri, right_vec_tri, apb_prod, amb_prod,
        thresh, e_min, max_expand):
    """Expand trial vector space if residues are large.

    Args:
        bse (fcdmft.gw.mol.bse.BSE): BSE object.
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        exci (double array): subspace excitation energy.
        tri_vec (double ndarray): trial vector.
        left_vec (double ndarray): left eigenvector in the full space.
        right_vec (double ndarray): right eigenvector in the full space.
        left_vec_tri (double ndarray): left eigenvector in the subspace.
        right_vec_tri (double ndarray): right eigenvector in the subspace.
        apb_prod (double ndarray): A+B matrix and trial vector contracted vectors.
        amb_prod (double ndarray): A-B matrix and trial vector contracted vectors.
        thresh (double): threshold to determine if a residue vector is small.
        e_min (float): minimum desired excitation energy.
        max_expand (int): maximum number of trial vectors to expand.

    Returns:
        bool: if all residues are small.
        ntri, int: the expanded number of trial vectors
    """
    nspin, nmo = mo_energy.shape
    max_vec = tri_vec.shape[0]
    nroot, ntri = left_vec_tri.shape

    # determine dimension
    nvir = [(nmo - nocc[i]) for i in range(nspin)]
    dim = [(nocc[i] * nvir[i]) for i in range(nspin)]
    full_dim = dim[0] + dim[1] if nspin == 2 else dim[0]

    # find eigenvector candidates in the target energy range
    emin_index = numpy.searchsorted(exci, e_min, side='left')
    if emin_index + nroot > exci.size:
        emin_index = exci.size - nroot
    exci_candidate_indices = numpy.s_[emin_index:emin_index+nroot]

    # calculate residue vector, equation 27 and 28 in doi.org/10.1063/1.477483.
    # left_res = numpy.zeros(shape=[nroot, full_dim], dtype=numpy.double)
    # right_res = numpy.zeros_like(left_res)
    # for i in range(nroot):
    #     right_res[i] = -exci[i] * left_vec[i]
    #     left_res[i] = -exci[i] * right_vec[i]
    left_res = -exci[exci_candidate_indices, None] * right_vec[exci_candidate_indices]
    right_res = -exci[exci_candidate_indices, None] * left_vec[exci_candidate_indices]

    left_res += numpy.matmul(left_vec_tri[exci_candidate_indices, :], amb_prod[:ntri, :])
    right_res += numpy.matmul(right_vec_tri[exci_candidate_indices, :], apb_prod[:ntri, :])

    # check convergence
    res_norms_left = numpy.linalg.norm(left_res, axis=1)**2
    res_norms_right = numpy.linalg.norm(right_res, axis=1)**2
    res_norms = numpy.maximum(res_norms_left, res_norms_right)
    max_res_norm = numpy.max(res_norms)
    conv_vec = numpy.where(res_norms < thresh, 1, 0)

    lib.logger.info(bse, "max residue norm = %.4e", max_res_norm)
    if numpy.all(conv_vec):
        return True, ntri

    not_converged = numpy.argwhere(conv_vec == 0).flatten()
    nexpand = min(max_expand, nroot, not_converged.size)
    candidates_to_expand = not_converged[:nexpand]

    # preconditioning the residues, equation 29 in doi.org/10.1063/1.477483.
    for s in range(nspin):
        q_vec = exci[:, None, None] - (mo_energy[s][None, None, nocc[s]:] - mo_energy[s][None, :nocc[s], None])
        q_vec = q_vec.reshape(-1, nocc[s]*nvir[s])
        left_res[:, s*dim[0]:s*dim[0]+dim[s]] /= q_vec
        right_res[:, s*dim[0]:s*dim[0]+dim[s]] /= q_vec

    for cand in candidates_to_expand:
        # for j in range(ntri):
        #     tmp = -numpy.dot(left_res[i], tri_vec[j])
        #     left_res[i] += tmp * tri_vec[j]
        #     tmp = -numpy.dot(right_res[i], tri_vec[j])
        #     right_res[i] += tmp * tri_vec[j]
        tmps = numpy.matmul(tri_vec[:ntri, :], left_res[cand])
        left_res[cand] -= numpy.matmul(tri_vec[:ntri, :].T, tmps)
        tmps = numpy.matmul(tri_vec[:ntri, :], right_res[cand])
        right_res[cand] -= numpy.matmul(tri_vec[:ntri, :].T, tmps)

        res_thresh = 1.0e-15
        # check left residue
        tmp = numpy.linalg.norm(left_res[cand])**2
        if tmp > res_thresh:
            assert ntri < max_vec, "number of trial vectors larger than max vector number"
            tmp = 1.0 / numpy.sqrt(tmp)
            tri_vec[ntri] += tmp * left_res[cand]
            tmp = -numpy.dot(right_res[cand], tri_vec[ntri])
            right_res[cand] += tmp * tri_vec[ntri]
            ntri += 1

        # check right residue
        tmp = numpy.linalg.norm(right_res[cand])**2
        if tmp > res_thresh:
            assert ntri < max_vec, "number of trial vectors larger than max vector number"
            tmp = 1.0 / numpy.sqrt(tmp)
            # check: should this be tri_vec[i]+= ?
            tri_vec[ntri] += tmp * right_res[cand]
            ntri += 1

    return False, ntri

def _get_lpq_bar(nocc, mo_energy, Lpq):
    """Calculate the auxiliary 3-center matrix.
    Lpq_bar = (epsilon)^-1 * Lpq
    Equation 11 in doi.org/10.1002/jcc.24688.

    Args:
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        Lpq (double ndarray): 3-center density-fitting matrix.

    Returns:
        Lpq_bar (double ndarray): auxiliary three-center matrix.
    """
    nspin, naux, _, _ = Lpq.shape

    # calculate the response function in the auxiliary basis
    X = numpy.zeros(shape=[naux, naux], dtype=numpy.double)
    for i in range(nspin):
        orb_diff = mo_energy[i][:nocc[i], None] - mo_energy[i][None, nocc[i]:]
        orb_diff = 1.0 / orb_diff
        X += 2.0 * einsum('Pia,ia,Qia->PQ', Lpq[i][:, :nocc[i], nocc[i]:], orb_diff, Lpq[i][:, :nocc[i], nocc[i]:])
    if nspin == 1:
        X *= 2.0

    # calculate the inverse dielectric function
    InvD = numpy.linalg.inv((numpy.eye(naux)-X))

    # calculate the auxiliary matrix
    Lpq_bar = einsum('PQ,sQmn->sPmn', InvD, Lpq)

    return Lpq_bar

def _get_lpq_bar_by_block(nocc, mo_energy, Lii, Lia):
    """Calculate the auxiliary 3-center matrix.
    Lpq_bar = (epsilon)^-1 * Lpq
    Equation 11 in doi.org/10.1002/jcc.24688.

    Args:
        nocc (int array): the number of occupied orbitals.
        mo_energy (double ndarray): orbital energy.
        Lpq (double ndarray): 3-center density-fitting matrix.

    Returns:
        Lii_bar, Lia_bar (double ndarrays): auxiliary three-center matrix.
    """
    nspin = len(Lia)
    naux, _, _ = Lia[0].shape
    nvir = [Lia_s.shape[2] for Lia_s in Lia]

    # calculate the response function in the auxiliary basis
    X = numpy.zeros(shape=[naux, naux], dtype=numpy.double)
    for i in range(nspin):
        orb_diff = mo_energy[i][:nocc[i], None] - mo_energy[i][None, nocc[i]:]
        orb_diff = 1.0 / orb_diff
        Pia = Lia[i] * (orb_diff * 2.0)

        # This line computes Pi = einsum('Pia, Qia -> PQ', Pia, Lia)
        X += Pia.reshape(naux, -1) @ Lia[i].reshape(naux, -1).T
        # X += 2.0 * einsum('Pia,ia,Qia->PQ', Lia[i], orb_diff, Lia[i])
    if nspin == 1:
        X *= 2.0

    # calculate the inverse dielectric function
    InvD = numpy.linalg.inv((numpy.eye(naux)-X))

    Lia_bar = []
    Lii_bar = []

    # calculate the auxiliary matrix
    # Lpq_bar = einsum('PQ,sQmn->sPmn', InvD, Lpq)
    for i in range(nspin):
        Lia_bar.append(
            numpy.matmul(InvD, Lia[i].reshape(naux, -1)).reshape(naux, nocc[i], nvir[i])
        )

        Lii_bar.append(
            numpy.matmul(InvD, Lii[i].reshape(naux, -1)).reshape(naux, nocc[i], nocc[i])
        )
        
    return Lii_bar, Lia_bar


def _get_oscillator_strength(multi, exci, X_vec, Y_vec, mo_coeff, nocc, mol):
    """Get transition dipoles and oscillator strengths.

    Args:
        multi (char): multiplicity. "s"=singlet, "t"=triplet, "u"=unrestricted.
        exci (double array): excitation energy.
        X_vec (double ndarray): X block of eigenvector (excitation).
        Y_vec (double ndarray): Y block of eigenvector (de-excitation).
        mo_coeff (double ndarray): coefficient from AO to MO.
        nocc (int array): number of occupied orbitals.
        mol (pyscf.gto.mole.Mole): Mole object for generating dipole matrix.

    Returns:
        dipole (double ndarray): transition dipoles of all excitations.
        oscillator_strength (double array): oscillator strengths of all excitations.
    """
    nspin, _, nmo = mo_coeff.shape
    nroot = X_vec[0].shape[0]

    dipole = numpy.zeros(shape=[3, nroot], dtype=numpy.double)
    oscillator_strength = numpy.zeros(shape=[nroot], dtype=numpy.double)

    # BSE is blind to triplet oscillator strength
    if multi == "t":
        return dipole, oscillator_strength

    # transfer X and Y to AO space
    X_AO = numpy.zeros(shape=[nspin, nroot, nmo, nmo], dtype=numpy.double)
    Y_AO = numpy.zeros_like(X_AO)
    for s in range(nspin):
        for i in range(nroot):
            X_AO[s, i] = reduce(numpy.matmul, (mo_coeff[s][:, :nocc[s]], X_vec[s][i], mo_coeff[s][:, nocc[s]:].T))
            Y_AO[s, i] = reduce(numpy.matmul, (mo_coeff[s][:, :nocc[s]], Y_vec[s][i], mo_coeff[s][:, nocc[s]:].T))

    with mol.with_common_orig((0,0,0)):
        ao_dip = mol.intor_symmetric('int1e_r', comp=3)

    for i in range(3):
        for j in range(nroot):
            for s in range(nspin):
                dipole[i][j] += numpy.trace(numpy.matmul(X_AO[s, j], ao_dip[i])) \
                    + numpy.trace(numpy.matmul(Y_AO[s, j], ao_dip[i]))

            if nspin == 1:
                dipole[i][j] *= numpy.sqrt(2.0)

    for i in range(nroot):
        oscillator_strength[i] = 2.0 / 3.0 * exci[i] * (dipole[0][i] ** 2 + dipole[1][i] ** 2 + dipole[2][i] ** 2)

    return dipole, oscillator_strength


def _get_spin_square(nocc, X_vec, Y_vec, mo_coeff, ovlp):
    """Get <S2> expectation value.

    Args:
        nocc (int array): number of occupied orbitals.
        X_vec (double ndarray): X block of eigenvector (excitation).
        Y_vec (double ndarray): Y block of eigenvector (de-excitation).
        mo_coeff (double ndarray): coefficient from AO to MO.
        ovlp (double ndarray): overlap matrix.

    Returns:
        s2 (double array): <S2> expectation value of excitations.
    """
    nroot = X_vec[0].shape[0]
    ab_ovlp = reduce(numpy.matmul, (mo_coeff[0].T, ovlp, mo_coeff[1]))
    s2 = numpy.zeros(shape=[nroot], dtype=numpy.double)
    s2[:] = nocc[0] - (nocc[0] - nocc[1]) / 2.0 + ((nocc[0] - nocc[1]) / 2.0)**2
    for iroot in range(nroot):
        # alpha excitation ket
        # a alpha and j beta exchange: alpha excitation bra
        s2[iroot] -= einsum("ia,ib,aj,bj->", X_vec[0][iroot]+Y_vec[0][iroot], X_vec[0][iroot]-Y_vec[0][iroot],
                            ab_ovlp[nocc[0]:, :nocc[1]], ab_ovlp[nocc[0]:, :nocc[1]])
        # a alpha and j beta exchange: beta excitation bra
        s2[iroot] -= einsum("ia,jb,ij,ab->", X_vec[0][iroot]+Y_vec[0][iroot], X_vec[1][iroot]-Y_vec[1][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]], ab_ovlp[nocc[0]:, nocc[1]:])
        # i alpha and j beta exchange: same alpha excitation bra
        s2[iroot] -= einsum("ia,ia,jk->", X_vec[0][iroot]+Y_vec[0][iroot], X_vec[0][iroot]-Y_vec[0][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]]**2)
        s2[iroot] += einsum("ia,ia,ik->", X_vec[0][iroot]+Y_vec[0][iroot], X_vec[0][iroot]-Y_vec[0][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]]**2)
        # beta excitation ket
        # i alpha and b beta exchange: beta excitation bra
        s2[iroot] -= einsum("ia,ib,ja,jb->", X_vec[1][iroot]+Y_vec[1][iroot], X_vec[1][iroot]-Y_vec[1][iroot],
                            ab_ovlp[:nocc[0], nocc[1]:], ab_ovlp[:nocc[0], nocc[1]:])
        # i alpha and b beta exchange: alpha excitation bra
        s2[iroot] -= einsum("ia,jb,ji,ba->", X_vec[1][iroot]+Y_vec[1][iroot], X_vec[0][iroot]-Y_vec[0][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]], ab_ovlp[nocc[0]:, nocc[1]:])
        # i alpha and j beta exchange: same alpha excitation bra
        s2[iroot] -= einsum("ia,ia,jk->", X_vec[1][iroot]+Y_vec[1][iroot], X_vec[1][iroot]-Y_vec[1][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]]**2)
        s2[iroot] += einsum("ia,ia,ji->", X_vec[1][iroot]+Y_vec[1][iroot], X_vec[1][iroot]-Y_vec[1][iroot],
                            ab_ovlp[:nocc[0], :nocc[1]]**2)

    return s2



[docs]
class BSE(lib.StreamObject):
    def __init__(self,
                 # initialize with a GW object
                 gw=None,
                 # initialize with nocc, mo_energy and Lpq
                 nocc=None, mo_energy=None, Lpq=None, verbose=5,
                 # options
                 TDA=False, nroot=10, max_vec=200, max_iter=100, 
                 max_expand=None, init_ntri = None, residue_thresh=1.0e-8):
        """Initialize BSE object.
        The BSE object can be initialized by restricted or unrestricted GW object.

        Args:
            gw (fcdmft.gw.mol, optional): GW object.
            nocc (int/int array, optional): number of occupied orbitals.
            mo_energy(double ndarray, optional): quasiparticle energy.
            Lpq (double ndarray, optional): three-center density-fitting matrix in MO.
            verbose (int, optional): print level.
            TDA (bool, optional): use TDA approximation to ignore B matrix. Defaults to False.
            nroot (int, optional): the number of desired roots. Defaults to 10.
            max_vec (int, optional): max allowed subspace size. Defaults to 200.
            max_iter (int, optional): max Davidson iteration. Defaults to 100.
            max_expand(int, optional): max number of trial vectors to expand. Defaults to nroot.
            init_ntri (int, optional): initial number of trial vectors. Default is 4 * nroot.
            residue_thresh (double, optional): threshold if the residue needs to be added as a new trial vector.
            Defaults to 1.0e-8.
        """
        # initialize matrix
        if gw is not None:
            self.nspin = 1 if numpy.asarray(gw.mo_energy).ndim == 1 else 2
            self.verbose = gw.mol.verbose
            self.mol = gw.mol
            self.mf = gw._scf
            self.nocc = numpy.asarray(gw.nocc)
            if self.nocc.ndim == 0:
                self.nocc = self.nocc[numpy.newaxis, ...]
            self.mo_energy = numpy.asarray(gw.mo_energy)
            if self.mo_energy.ndim == 1:
                self.mo_energy = self.mo_energy[numpy.newaxis, ...]
            self.mo_coeff = gw.mo_coeff
            if self.mo_coeff.ndim == 2:
                self.mo_coeff = self.mo_coeff[numpy.newaxis, ...]
            self.nmo = self.mo_energy.shape[-1]
            # initialize density-fitting matrix
            if self.nspin == 2 and isinstance(gw.nmo, int):
                gw.nmo = [gw.nmo, gw.nmo]
            self.Lpq = gw.Lpq if hasattr(gw, "Lpq") else None
            if self.Lpq is None:
                self.Lpq = numpy.asarray(numpy.asarray(gw.ao2mo(gw.mo_coeff)))
            if self.Lpq.ndim == 3:
                self.Lpq = self.Lpq[numpy.newaxis, ...]
        elif gw is None and (nocc is not None and mo_energy is not None and Lpq is not None):
            self.verbose = verbose
            if numpy.asarray(mo_energy).ndim == 1:
                self.nspin = 1
                self.nocc = numpy.asarray(nocc)[numpy.newaxis, ...]
                self.mo_energy = numpy.asarray(mo_energy)[numpy.newaxis, ...]
                self.Lpq = numpy.asarray(Lpq)[numpy.newaxis, ...]
                self.nmo = mo_energy.shape[0]
            else:
                self.nspin = 2
                self.nocc = numpy.asarray(nocc)
                self.mo_energy = numpy.asarray(mo_energy)
                self.Lpq = numpy.asarray(Lpq)
                self.nmo = [mo_energy.shape[1], mo_energy.shape[1]]
        else:
            raise NotImplementedError("BSE object is not properly initialized!")

        # options
        self.TDA = TDA  # use TDA approximation to ignore B matrix

        # Davidson algorithm
        self.multi = None  # multiplicity
        self.nroot = nroot  # the number of desired roots
        self.max_vec = max_vec  # max allowed subspace size
        self.max_iter = max_iter  # max Davidson iteration
         # max number of trial vectors to expand per iteration
        self.max_expand = nroot if max_expand is None else max_expand
        self.residue_thresh = residue_thresh  # threshold if the residue needs to be added as a new trial vector
        self.init_ntri = 4 * nroot if init_ntri is None else init_ntri

        # results
        self.exci = None  # excitation energy
        self.X_vec = None  # X block of eigenvector (excitation)
        self.Y_vec = None  # Y block of eigenvector (de-excitation)


[docs]
    def dump_flags(self):
        log = lib.logger.Logger(self.stdout, self.verbose)
        log.info('')
        log.info('******** %s ********', self.__class__)
        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
        log.info('multiplicity = %s', self.multi)
        log.info('nmo = %s', self.nmo)
        log.info('nocc = %s', self.nocc[0] if self.nspin == 1 else self.nocc)
        log.info('nvir = %s', nvir[0] if self.nspin == 1 else nvir)
        log.info('occ-vir dimension = %s', dim[0] if self.nspin == 1 else dim)
        if self.nspin == 2:
            log.info('BSE full dimension = %s', dim[0] + dim[1])
        log.info('Tamm-Dancoff approximation = %s', self.TDA)
        log.info('number of roots = %d', self.nroot)
        log.info('max subspace size = %d', self.max_vec)
        log.info('max iteration = %s', self.max_iter)
        log.info('convergence tolerance = %s', self.residue_thresh)
        log.info('')
        return



[docs]
    def check_memory(self):
        """Check memory needed for the BSE calculation.
        """
        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
        full_dim = dim[0] + dim[1] if self.nspin == 2 else dim[0]
        naux = self.Lpq.shape[1]

        # Lpq and Lpq_bar; trial vector, A+B/A-B matrix with trial vector product
        mem = (naux * self.nmo * self.nmo * 2 + self.max_vec * full_dim * 3) * 8
        lib.logger.info(self, "BSE needs at least %.1f GB memory.", mem / 1.0e9)

        return



[docs]
    def kernel(self, multi, e_min=0.0, delta=0.0):
        # check spin and multiplicity
        assert isinstance(multi, str)
        multi = multi[0].lower()
        assert (self.nspin == 1 and (multi == 's' or multi == 't')) or (self.nspin == 2 and multi == 'u')
        self.multi = multi

        cput0 = (time.process_time(), time.perf_counter())
        self.dump_flags()
        self.check_memory()
        exci, X, Y = bse_davidson(bse=self, multi=multi, e_min=e_min, delta=delta)
        lib.logger.timer(self, 'BSE', *cput0)
        return exci, X, Y



[docs]
    def full_diagonalization(self, multi):
        cput0 = (time.process_time(), time.perf_counter())
        lib.logger.info(self, "\nBSE full diagonalization: %s", multi)
        self.multi = multi

        # set nroot as full dimension for analysis
        nvir = [(self.nmo - self.nocc[i]) for i in range(self.nspin)]
        dim = [(self.nocc[i] * nvir[i]) for i in range(self.nspin)]
        self.nroot = dim[0] + dim[1] if self.nspin == 2 else dim[0]

        # A+B, A-B, X+Y, X-Y
        mem = (self.nroot * self.nroot * 4) * 8
        lib.logger.info(self, "BSE needs at least %.1f GB memory.", mem / 1.0e9)

        exci, X, Y = bse_full_diagonalization(bse=self, multi=multi)
        lib.logger.timer(self, 'BSE full diagonalization', *cput0)
        return exci, X, Y



[docs]
    def analyze(self, thresh=0.1, oscillator=True, s2=True):
        """Analyze excitations.

        Args:
            thresh (float, optional): threshold to print dominant component. Defaults to 0.1.
        """
        multi = self.multi
        nspin = self.nspin
        nmo = self.nmo
        nocc = self.nocc
        X_vec, Y_vec = self.X_vec, self.Y_vec
        nvir = [(nmo - nocc[i]) for i in range(nspin)]

        if oscillator is True:
            dipole, oscillator_strength = _get_oscillator_strength(
                multi=multi, exci=self.exci, X_vec=X_vec, Y_vec=Y_vec, mo_coeff=self.mo_coeff, nocc=nocc, mol=self.mol)

        if s2 is True and nspin == 2:
            s2 = _get_spin_square(nocc=nocc, X_vec=X_vec, Y_vec=Y_vec, mo_coeff=self.mo_coeff, ovlp=self.mf.get_ovlp())

        AU2EV = 27.211386
        print("-" * 55)
        if multi == "s":
            print("restricted singlet BSE")
        elif multi == "t":
            print("restricted triplet BSE")
        elif multi == "u":
            print("unrestricted BSE")
        for r in range(self.nroot):
            print("-" * 55)
            print("excited state: %-d" % (r + 1))
            print("excitation energy:   %15.8f   AU   %15.8f   eV" % (self.exci[r], self.exci[r] * AU2EV))
            if multi == "s":
                if oscillator is True:
                    print("spin allowed, oscillator strength:   %15.8f   AU" % oscillator_strength[r])
                    print("transition dipole: x =  %15.6f  , y =  %15.6f  , z =  %15.6f" %
                          (dipole[0][r], dipole[1][r], dipole[2][r]))
            elif multi == "t":
                if oscillator is True:
                    print("spin forbidden, oscillator strength and transition dipoles are not defined")
            elif multi == "u":
                if s2 is True:
                    print("<S^2> =    %.6f", s2[r])
                if oscillator is True:
                    print("oscillator strength:   %15.8f   AU" % oscillator_strength[r])
                    print("transition dipole: x =  %15.6f  , y =  %15.6f  , z =  %15.6f" %
                          (dipole[0][r], dipole[1][r], dipole[2][r]))

            print("dominant component")
            if nspin == 1:
                for i in range(nocc[0]):
                    for a in range(nvir[0]):
                        if abs(X_vec[0][r][i][a]) > thresh:
                            print("%5d -> %5d, %15f, X" % (i+1, a+nocc[0]+1, X_vec[0][r][i][a]))
                        if abs(Y_vec[0][r][i][a]) > thresh:
                            print("%5d -> %5d, %15f, Y" % (i+1, a+nocc[0]+1, Y_vec[0][r][i][a]))
            else:
                for s in range(nspin):
                    spin = "a" if s == 0 else "b"
                    for i in range(nocc[s]):
                        for a in range(nvir[s]):
                            if abs(X_vec[s][r][i][a]) > thresh:
                                print("%5d -> %5d, spin %c, %15f, X" % (i+1, a+nocc[s]+1, spin, X_vec[s][r][i][a]))
                            if abs(Y_vec[s][r][i][a]) > thresh:
                                print("%5d -> %5d, spin %s, %15f, Y" % (i+1, a+nocc[s]+1, spin, Y_vec[s][r][i][a]))



[docs]
    def get_oscillator_strength(self):
        """Get transition dipoles and oscillator strengths.

        Returns:
            dipole (double ndarray): transition dipoles.
            oscillator_strength (double array): oscillator strengths.
        """
        assert self.exci is not None and self.X_vec is not None and self.Y_vec is not None
        assert self.mo_coeff is not None and self.mol is not None
        dipole, oscillator_strength = _get_oscillator_strength(
            multi=self.multi, exci=self.exci, X_vec=self.X_vec, Y_vec=self.Y_vec, mo_coeff=self.mo_coeff,
            nocc=self.nocc, mol=self.mol)

        return dipole, oscillator_strength




if __name__ == '__main__':
    from pyscf import gto, dft, scf
    from fcdmft.gw.mol.gw_exact import GWExact
    from fcdmft.gw.mol.ugw_exact import UGWExact
    mol = gto.Mole()
    mol.verbose = 0
    mol.atom = [
        [8, (0., 0., 0.)],
        [1, (0.7571, 0., 0.5861)],
        [1, (-0.7571, 0., 0.5861)]
    ]
    mol.basis = 'def2-svp'
    mol.build()

    # GW-exact/BSE
    rmf = dft.RKS(mol)
    rmf.xc = 'pbe'
    rmf.kernel()

    rgw = GWExact(rmf)
    rgw.eta = 1.0e-5
    rgw.linearized = True
    rgw.kernel()
    rbse = BSE(rgw)
    rbse.verbose = 5
    exci_s = rbse.kernel('s')[0]
    #dipole, oscillator_strength = rbse.get_oscillator_strength()
    #print("oscillator strength\n", oscillator_strength)
    #rbse.analyze()
    exci_t = rbse.kernel('t')[0]
    #rbse.analyze()
    print("GWExact@PBE/BSE singlet excitation energy (eV)\n", exci_s*27.21138)
    print("GWExact@PBE/BSE triplet excitation energy (eV)\n", exci_t*27.21138)
    assert abs(exci_s[0] - 0.26302056) < 1e-5
    assert abs(exci_t[0] - 0.22831865) < 1e-5

    # full diagonalization, N6 scaling
    exci_s_full = rbse.full_diagonalization(multi="s")[0]
    #rbse.analyze()
    assert abs(exci_s[0] - exci_s_full[0]) < 1e-5
    exci_t_full = rbse.full_diagonalization(multi="t")[0]
    #rbse.analyze()
    assert abs(exci_t[0] - exci_t_full[0]) < 1e-5

    # GW-AC with BSE
    from fcdmft.gw.mol.gw_ac import GWAC
    rgw = GWAC(rmf)
    rgw.kernel()
    rbse = BSE(rgw)
    exci_s = rbse.kernel('s')[0]
    #rbse.analyze()
    exci_t = rbse.kernel('t')[0]
    #rbse.analyze()
    print("GWAC@PBE/BSE singlet excitation energy (eV)\n", exci_s*27.21138)
    print("GWAC@PBE/BSE triplet excitation energy (eV)\n", exci_t*27.21138)
    assert abs(exci_s[0] - 0.25749397) < 1e-5
    assert abs(exci_t[0] - 0.22299263) < 1e-5

    # unrestricted
    mol = gto.Mole()
    mol.verbose = 5
    mol.atom = [
        [8, (0., 0., 0.)],
        [1, (0.7571, 0., 0.5861)],
        [1, (-0.7571, 0., 0.5861)]
    ]
    mol.charge = 1
    mol.spin = 1
    mol.basis = 'def2-svp'
    mol.build()
    umf = dft.UKS(mol)
    umf.xc = 'pbe'
    umf.kernel()

    # UGW-exact/BSE
    ugw = UGWExact(umf)
    ugw.eta = 1.0e-5
    ugw.linearized = True
    ugw.kernel()

    ubse = BSE(ugw)
    exci = ubse.kernel("u")[0]
    #ubse.analyze()
    assert abs(exci[0] - 0.02388355) < 1e-5
    print("GWExact@PBE/BSE unrestricted excitation energy (eV)\n", exci*27.21138)

    # UGW-AC/BSE
    from fcdmft.gw.mol.ugw_ac import UGWAC
    ugw = UGWAC(umf)
    ugw.kernel()

    ubse = BSE(ugw)
    exci = ubse.kernel("u")[0]
    #ubse.analyze()
    assert abs(exci[0] - 0.02114003) < 1e-5
    print("GWAC@PBE/BSE unrestricted excitation energy (eV)\n", exci*27.21138)

    # Energy-specific BSE
    from fcdmft.gw.mol.gw_ac import GWAC
    rgw = GWAC(rmf)
    rgw.kernel()
    rbse = BSE(rgw)
    rbse.verbose = 5
    exci_s = rbse.kernel('s', e_min=0.4)[0]
    #rbse.analyze()
    exci_t = rbse.kernel('t', e_min=0.4)[0]
    #rbse.analyze()
    print("GWAC@PBE/BSE singlet excitation energy (eV)\n", exci_s*HARTREE2EV)
    print("GWAC@PBE/BSE triplet excitation energy (eV)\n", exci_t*HARTREE2EV)
    assert abs(exci_s[0] - 0.42691789) < 1e-5
    assert abs(exci_t[0] - 0.45195324) < 1e-5