From 47818b8240074222ce771cce4eb2af097db3d5ba Mon Sep 17 00:00:00 2001
From: jabader97 <51213988+jabader97@users.noreply.github.com>
Date: Tue, 10 May 2022 16:33:01 +0200
Subject: [PATCH 01/29] [ADD] Base class for `NLLLoss` & implement `MSELoss` (#252)

The base class automates the Hessian square root approximation via MC-sampling.
It requires specifying the likelihood distribution. The Hessian square root is
then approximated by computing gradients with targets drawn from the likelihood
with `autograd`.

---

* Updated cross entropy and MSE to use new NLL base
* Changed _post_process and _checks to raise NotImplementedError
* Refactored NLL base for general log-prob derivative from torch.distributions, as well as overwriting for MSE and CE
* Fixed some spacing errors
* Updated doc strings
* Added test for NLL version of compute_sampled_grads
* Fixed spacing issue introduced in last commit
* Implemented (some of) commented fixes
* Added NotImplementedError to _verify_support to fix coveralls problem
* Fixed some commenting, removed mean from mse make distribution, moved sampling into loop for nll base
* [REF] Use distribution for sampled gradients in manual approach
* Changed use_dist to use_autograd, fixed return statement error in MSE, fixed device error in MSE
* Black
* Reverted changes to CEL
* Reverted cross entropy changes
* Removed unneeded changes to clean the diff
* Some docstring updates
* A few missed changes for diff
* [REF] Move `use_autograd` inside `NLLLossDerivatives`
* [REF] Change default of `use_autograd` to `False` for `MSELoss`
* [FMT] Remove space
* [DEL] Remove `use_autograd` from `CrossEntropyLoss`
* [DOC] Clarify `use_autograd` in test function
* [FIX] Syntax error
* [CI] Add NLLLossDerivatives to fully documented
* Added missing type annotations to nll_base.py, removed redundant autograd_res call in test_sqrt_hessian_sampled_squared_approximates_hessian_nll
* Removed unnecessary ABC in MSE loss, fixed _compute_sampled_grad_manual name
* Fixed documentation to match standards and reflect current version
* Darglint fixes
* Pydocstyle fix
* Darglint formatting fix
* Removed retain_graph=True
* Created MSE_LOSS_PROBLEMS for test_sqrt_hessian_sampled_squared_approximates_hessian_nll to run on
* Added autograd test to check that sample has same shape as subsampled_input
* Reformatted some too-long lines
* [REF] Remove `enable_grad` and `Variable`
* [REF] Shorten import
* [REF] Rewrite NLL test with recursion
* [FIX] Remove unused import
* [FIX] darglint
* [DOC] Polish MSELoss
* [DOC] Polish NLL base
* [DOC] Polish derivatives test
* [FIX] Type annotation
* [FIX] Darglint
* [DOC] Polish NLLbase
* [DOC] One more pass through docstrings

Co-authored-by: Felix Dangel
Co-authored-by: Felix Dangel
---
 backpack/core/derivatives/mseloss.py       | 109 ++++++----
 backpack/core/derivatives/nll_base.py      | 188 ++++++++++++++++++
 backpack/utils/module_classification.py    |  13 ++
 fully_documented.txt                       |   1 +
 test/core/derivatives/derivatives_test.py  |  61 ++++++
 .../derivatives/implementation/backpack.py  |  12 +-
 test/core/derivatives/problem.py           |   5 +-
 7 files changed, 352 insertions(+), 37 deletions(-)
 create mode 100644 backpack/core/derivatives/nll_base.py

diff --git a/backpack/core/derivatives/mseloss.py b/backpack/core/derivatives/mseloss.py
index f09750052..b5c4caa97 100644
--- a/backpack/core/derivatives/mseloss.py
+++ b/backpack/core/derivatives/mseloss.py
@@ -3,21 +3,34 @@
 from math import sqrt
 from typing import List, Tuple
 
-from torch import Tensor, eye, normal, ones
+from torch import 
Size, Tensor, eye, ones, tensor +from torch.distributions import Normal from torch.nn import MSELoss -from backpack.core.derivatives.basederivatives import BaseLossDerivatives +from backpack.core.derivatives.nll_base import NLLLossDerivatives -class MSELossDerivatives(BaseLossDerivatives): - """Derivatives of the MSE Loss. +class MSELossDerivatives(NLLLossDerivatives): + """Derivatives of ``MSELoss``. We only support 2D tensors. For `X : [n, d]` and `Y : [n, d]`, if `reduce=sum`, the MSE computes `∑ᵢ₌₁ⁿ ‖X[i,∶] − Y[i,∶]‖²`. If `reduce=mean`, the result is divided by `nd`. + + ``MSELoss`` is a negative log-likelihood of a Gaussian with mean corresponding + to the module input and constant standard deviation √0.5. """ + def __init__(self, use_autograd: bool = False): + """Initialization for MSE loss derivative. + + Args: + use_autograd: Compute gradients with autograd (rather than manual) + Defaults to ``False`` (manual computation). + """ + super().__init__(use_autograd=use_autograd) + def _sqrt_hessian( self, module: MSELoss, @@ -25,7 +38,7 @@ def _sqrt_hessian( g_out: Tuple[Tensor], subsampling: List[int] = None, ) -> Tensor: # noqa: D102 - self.check_input_dims(module) + self._check_input_dims(module) input0: Tensor = module.input0 N, D = input0.shape @@ -40,33 +53,6 @@ def _sqrt_hessian( return sqrt_H - def _sqrt_hessian_sampled( - self, - module: MSELoss, - g_inp: Tuple[Tensor], - g_out: Tuple[Tensor], - mc_samples: int = 1, - subsampling: List[int] = None, - ) -> Tensor: - self.check_input_dims(module) - - input0: Tensor = module.input0 - N, D = input0.shape - N_active = N if subsampling is None else len(subsampling) - samples = normal( - 0, - 1, - size=[mc_samples, N_active, D], - device=input0.device, - dtype=input0.dtype, - ) - samples *= sqrt(2) / sqrt(mc_samples) - - if module.reduction == "mean": - samples /= sqrt(input0.numel()) - - return samples - def _sum_hessian(self, module, g_inp, g_out): """The Hessian, summed across the batch dimension. @@ -78,7 +64,7 @@ def _sum_hessian(self, module, g_inp, g_out): Returns: a `[D, D]` tensor of the Hessian, summed across batch """ - self.check_input_dims(module) + self._check_input_dims(module) N, D = module.input0.shape H = 2 * eye(D, device=module.input0.device) @@ -101,10 +87,63 @@ def hessian_mat_prod(mat): return hessian_mat_prod - def check_input_dims(self, module): + def _verify_support(self, module: MSELoss): + """We only support 2D tensors.""" + self._check_input_dims(module) + + def _make_distribution(self, subsampled_input: Tensor) -> Normal: + """Create the likelihood distribution whose NLL is the MSE. + + The log probability of the Gaussian distribution is proportional to + ¹/₍₂𝜎²₎∑ᵢ₌₁ⁿ (xᵢ−𝜇)². Because MSE = ∑ᵢ₌₁ⁿ(Yᵢ−Ŷᵢ)², this is + equivalent for samples drawn from a Gaussian distribution with + mean of the subsampled input and standard deviation √0.5. + + Args: + subsampled_input: input after subsampling + + Returns: + Normal distribution for targets | inputs + """ + return Normal( + subsampled_input, tensor(sqrt(0.5), device=subsampled_input.device) + ) + + def _check_input_dims(self, module: MSELoss): """Raises an exception if the shapes of the input are not supported.""" if not len(module.input0.shape) == 2: raise ValueError("Only 2D inputs are currently supported for MSELoss.") - def hessian_is_psd(self): + def hessian_is_psd(self) -> bool: + """Return whether cross-entropy loss Hessian is positive semi-definite. 
+ + Returns: + True + """ return True + + @staticmethod + def _get_mean_normalization(input: Tensor) -> int: + return input.numel() + + def _compute_sampled_grads_manual( + self, subsampled_input: Tensor, mc_samples: int + ) -> Tensor: + """Manually compute gradients from sampled targets. + + Because MSE = ∑ᵢ₌₁ⁿ(Yᵢ−Ŷᵢ)², the gradient is 2∑ᵢ₋₁ⁿ(Yᵢ−Ŷᵢ). + + Args: + subsampled_input: input after subsampling + mc_samples: number of samples + + Returns: + Gradient samples + """ + dist = self._make_distribution(subsampled_input) + samples = dist.sample(sample_shape=Size([mc_samples])) + subsampled_input_expanded = subsampled_input.unsqueeze(0).expand( + mc_samples, -1, -1 + ) + + return 2 * (samples - subsampled_input_expanded) diff --git a/backpack/core/derivatives/nll_base.py b/backpack/core/derivatives/nll_base.py new file mode 100644 index 000000000..1f8b14795 --- /dev/null +++ b/backpack/core/derivatives/nll_base.py @@ -0,0 +1,188 @@ +"""Partial derivative bases for NLL losses.""" +from math import sqrt +from typing import List, Tuple + +from torch import Tensor, stack +from torch.autograd import grad +from torch.distributions import Distribution +from torch.nn import Module + +from backpack.core.derivatives.basederivatives import BaseLossDerivatives +from backpack.utils.subsampling import subsample + + +class NLLLossDerivatives(BaseLossDerivatives): + """Base class for partial derivatives of negative log-likelihood losses. + + These loss functions can be expressed as a negative log-likelihood (NLL) + of targets given the input, 𝑙(fₙ,yₙ)= −log p(yₙ | fₙ) with a likelihood + distribution p(· | f). + """ + + def __init__(self, use_autograd: bool = True): + """Initialization. + + Args: + use_autograd: Compute gradient samples with autograd (rather than manually). + Default: ``True``. This argument is used to test the non-default + computation. + """ + self.use_autograd = use_autograd + + def _sqrt_hessian_sampled( + self, + module: Module, + g_inp: Tuple[Tensor], + g_out: Tuple[Tensor], + mc_samples: int = 1, + subsampling: List[int] = None, + ) -> Tensor: + """Approximate the Hessian square root through Monte-Carlo sampling. + + If use_autograd is True, _make_distribution must be implemented. + Otherwise, _compute_sampled_grads_manual must be implemented. + + In mean reduction mode, _get_mean_normalization must be implemented. + + Args: + module: loss module. + g_inp: Gradient of loss w.r.t. input + g_out: Gradient of loss w.r.t. output + mc_samples: number of Monte Carlo samples to take + subsampling: Indices of samples that are sliced along the dimension + + Returns: + Approximate Hessian square root. Has shape [mc_samples, + subsampled_input.shape]. + """ + self._verify_support(module) + subsampled_input = subsample(module.input0, subsampling=subsampling) + sqrt_hessian = self.compute_sampled_grads(subsampled_input, mc_samples) / sqrt( + mc_samples + ) + if module.reduction == "mean": + sqrt_hessian /= sqrt(self._get_mean_normalization(module.input0)) + return sqrt_hessian + + def _verify_support(self, module: Module): + """Verify that the module hyperparameters are supported. + + Args: + module: loss module + + Raises: + NotImplementedError: If the module has unsupported hyperparameters. + """ + raise NotImplementedError + + def compute_sampled_grads( + self, subsampled_input: Tensor, mc_samples: int + ) -> Tensor: + """Compute gradients with targets drawn from the likelihood p(· | f). + + If use_autograd is True, use _compute_sampled_grads_autograd. 
+ Otherwise, use _compute_sampled_grads_manual. + + Args: + subsampled_input: input after subsampling + mc_samples: number of gradient samples + + Returns: + Sampled gradients of shape [mc_samples, *subsampled_input.shape] + """ + grad_func = ( + self._compute_sampled_grads_autograd + if self.use_autograd + else self._compute_sampled_grads_manual + ) + return grad_func(subsampled_input, mc_samples) + + def _compute_sampled_grads_autograd( + self, subsampled_input: Tensor, mc_samples: int + ) -> Tensor: + """Compute gradients for samples of the likelihood distribution with autograd. + + _make_distribution must be implemented for this function to work. + + Args: + subsampled_input: input after subsampling + mc_samples: number of samples + + Returns: + Sampled gradients of shape [mc_samples, *subsampled_input.shape] + """ + subsampled_input.requires_grad = True + gradients = [] + + dist = self._make_distribution(subsampled_input) + self._check_distribution_shape(dist, subsampled_input) + + for _ in range(mc_samples): + y_tilde = dist.sample() + loss_tilde = -dist.log_prob(y_tilde).sum() + gradients.append(grad(loss_tilde, subsampled_input)[0]) + + return stack(gradients) + + def _compute_sampled_grads_manual( + self, subsampled_input: Tensor, mc_samples: int + ) -> Tensor: + """Compute gradients for samples of the likelihood distribution manually. + + This function can be used instead of _compute_sampled_grads_autograd if + the gradient is known analytically. + + Args: + subsampled_input: input after subsampling + mc_samples: number of samples + + Raises: + NotImplementedError: if manual sampled gradients not implemented + """ + raise NotImplementedError("Manual sampled gradients not implemented.") + + def _make_distribution(self, subsampled_input: Tensor) -> Distribution: + """Create the likelihood distribution p(· | f). + + This should be in the form of a torch.Distributions object for p, such that + the desired loss 𝑙(f, y) α ∑ₙ − log p(yₙ | fₙ). + + Otherwise, the returned object must offer functions to draw samples and to + evaluate the log-probability. + + Args: + subsampled_input: input after subsampling + + Raises: + NotImplementedError: If not implemented. + """ + raise NotImplementedError + + @staticmethod + def _get_mean_normalization(input: Tensor) -> int: + """Return the normalization factor in mean mode. + + The number C in loss = 1 / C * ∑ᵢ lossᵢ. + + Args: + input: input to the layer + + Raises: + NotImplementedError: If not implemented + """ + raise NotImplementedError + + @staticmethod + def _check_distribution_shape(dist: Distribution, subsampled_input: Tensor): + """Verify shape of sampled targets y ∼ p(· | f). + + Args: + dist: Distribution of the likelihood p(y | f), e.g. created by + _make_distribution. + subsampled_input: Input after subsampling. + + Raises: + ValueError: If the target samples have incorrect shape. 
+ """ + if dist.sample().shape != subsampled_input.shape: + raise ValueError("Sample does not have same shape as subsampled_input.") diff --git a/backpack/utils/module_classification.py b/backpack/utils/module_classification.py index b8d9b5b5f..ff3470990 100644 --- a/backpack/utils/module_classification.py +++ b/backpack/utils/module_classification.py @@ -3,6 +3,7 @@ from torch.nn import Module, Sequential from torch.nn.modules.loss import _Loss +from backpack.core.derivatives.mseloss import MSELoss from backpack.custom_module.branching import Parallel, _Branch from backpack.custom_module.reduce_tuple import ReduceTuple @@ -19,6 +20,18 @@ def is_loss(module: Module) -> bool: return isinstance(module, _Loss) +def is_mse(module: Module) -> bool: + """Return whether 'module' is a MSELoss function. + + Args: + module: A PyTorch module. + + Returns: + Whether 'module' is an MSE loss function + """ + return isinstance(module, MSELoss) + + def is_no_op(module: Module) -> bool: """Return whether the module does no operation in graph. diff --git a/fully_documented.txt b/fully_documented.txt index 275f0b92d..e56a28e68 100644 --- a/fully_documented.txt +++ b/fully_documented.txt @@ -5,6 +5,7 @@ backpack/context.py backpack/custom_module/ backpack/core/derivatives/basederivatives.py +backpack/core/derivatives/nll_base.py backpack/core/derivatives/rnn.py backpack/core/derivatives/shape_check.py backpack/core/derivatives/__init__.py diff --git a/test/core/derivatives/derivatives_test.py b/test/core/derivatives/derivatives_test.py index 5eae40c54..8c2c0c2d6 100644 --- a/test/core/derivatives/derivatives_test.py +++ b/test/core/derivatives/derivatives_test.py @@ -73,6 +73,9 @@ problem.make_id() for problem in CUSTOM_SLICING_MODULE_PROBLEMS ] +MSE_PROBLEMS = [problem for problem in PROBLEMS if problem.is_mse()] +MSE_IDS = [problem.make_id() for problem in MSE_PROBLEMS] + SUBSAMPLINGS = [None, [0, 0], [2, 0]] SUBSAMPLING_IDS = [f"subsampling={s}".replace(" ", "") for s in SUBSAMPLINGS] @@ -344,6 +347,64 @@ def test_sqrt_hessian_sampled_squared_approximates_hessian( problem.tear_down() +@mark.parametrize("subsampling", SUBSAMPLINGS, ids=SUBSAMPLING_IDS) +@mark.parametrize("problem", MSE_PROBLEMS, ids=MSE_IDS) +def test_sqrt_hessian_sampled_squared_approximates_hessian_nll( + problem: DerivativesTestProblem, + subsampling: Union[List[int], None], + mc_samples: int = 50000, + chunks: int = 10, + rerun_on_crash: bool = True, +) -> None: + """Test the MC-sampled sqrt decomposition of the input Hessian for NLL loss base. + + Compares the Hessian to reconstruction from individual Hessian MC-sampled + sqrt. This test runs specifically on the autograd version of + compute_sampled_grads, rather than manual versions which are used by default + and tested elsewhere. + + Args: + problem: Test case. + subsampling: Indices of active samples. + mc_samples: number of samples. Defaults to 50000. + chunks: Number of passes the MC samples will be processed sequentially. + rerun_on_crash: Run the test again with more samples, then crash if it + still fails. Default: ``True``. + + Raises: + AssertionError: If the MC-sampled Hessian square root does not square to the + exact Hessian. 
+ """ + problem.set_up() + skip_subsampling_conflict(problem, subsampling) + RTOL, ATOL = 1e-2, 8e-3 + + autograd_res = AutogradDerivatives(problem).input_hessian(subsampling=subsampling) + + try: + backpack_res = BackpackDerivatives(problem).input_hessian_via_sqrt_hessian( + mc_samples=mc_samples, + chunks=chunks, + subsampling=subsampling, + use_autograd=True, + ) + problem.tear_down() + check_sizes_and_values(autograd_res, backpack_res, rtol=RTOL, atol=ATOL) + + except AssertionError as e: + if rerun_on_crash: + more = 10 + test_sqrt_hessian_sampled_squared_approximates_hessian_nll( + problem, + subsampling, + mc_samples=mc_samples * more, + chunks=chunks * more, + rerun_on_crash=False, + ) + else: + raise e + + @mark.parametrize("subsampling", SUBSAMPLINGS, ids=SUBSAMPLING_IDS) @mark.parametrize("problem", LOSS_FAIL_PROBLEMS, ids=LOSS_FAIL_IDS) def test_sqrt_hessian_sampled_should_fail( diff --git a/test/core/derivatives/implementation/backpack.py b/test/core/derivatives/implementation/backpack.py index 092d368c1..f6e1a1693 100644 --- a/test/core/derivatives/implementation/backpack.py +++ b/test/core/derivatives/implementation/backpack.py @@ -5,6 +5,7 @@ from torch import Tensor, einsum, zeros +from backpack.core.derivatives.nll_base import NLLLossDerivatives from backpack.utils.subsampling import subsample @@ -82,7 +83,11 @@ def sum_hessian(self): # noqa: D102 return self.problem.derivative.sum_hessian(self.problem.module, None, None) def input_hessian_via_sqrt_hessian( - self, mc_samples: int = None, chunks: int = 1, subsampling: List[int] = None + self, + mc_samples: int = None, + chunks: int = 1, + subsampling: List[int] = None, + use_autograd: bool = False, ) -> Tensor: """Computes the Hessian w.r.t. to the input from its matrix square root. @@ -92,6 +97,8 @@ def input_hessian_via_sqrt_hessian( chunks: Maximum sequential split of the computation. Default: ``1``. Only used if mc_samples is specified. subsampling: Indices of active samples. ``None`` uses all samples. + use_autograd: Compute sampled gradients with ``autograd``. Only relevant + for ``NLLLossDerivatives``. Default: ``False``. Returns: Hessian with respect to the input. 
Has shape @@ -105,6 +112,9 @@ def input_hessian_via_sqrt_hessian( chunk_samples = chunk_sizes(mc_samples, chunks) chunk_weights = [samples / mc_samples for samples in chunk_samples] + if isinstance(self.problem.derivative, NLLLossDerivatives): + self.problem.derivative.use_autograd = use_autograd + individual_hessians: Tensor = sum( weight * self._sample_hessians_from_sqrt( diff --git a/test/core/derivatives/problem.py b/test/core/derivatives/problem.py index af3524159..4d98ed533 100644 --- a/test/core/derivatives/problem.py +++ b/test/core/derivatives/problem.py @@ -8,7 +8,7 @@ from torch import Tensor, long from backpack import extend -from backpack.utils.module_classification import is_loss +from backpack.utils.module_classification import is_loss, is_mse from backpack.utils.subsampling import subsample @@ -141,6 +141,9 @@ def make_output_shape(self): def is_loss(self): return is_loss(self.make_module()) + def is_mse(self): + return is_mse(self.make_module()) + def forward_pass( self, input_requires_grad: bool = False, subsampling: List[int] = None ) -> Tuple[Tensor, Tensor, Dict[str, Tensor]]: From 0a488d7cd6658a7a778757f5ce3dafb8746a74cd Mon Sep 17 00:00:00 2001 From: jabader97 <51213988+jabader97@users.noreply.github.com> Date: Fri, 3 Jun 2022 14:14:49 +0200 Subject: [PATCH 02/29] [REF] Implement `CrossEntropyLoss` as `NLLLoss` (#256) - Replace shape check of samples in main library with test - Add `retain_grad=True` for autograd computation of sampled gradients (for MSELoss, it worked without `retain_graph`) --- * [REF] Changed cross entropy loss to NLL base * [REF] Removed arrange and rearrange, made CE work for autograd * [REF] Changed compute_grad_manual for CE to use _make_distribution * [REF] some cleaning * [REF] Moved nll distribution shape check * [FIX] darglint, isort * [FIX] removed some unused import statements * [REF] Remove redundant import, improve names * [REF] Improve readability by linebreaks * [REF] Import loss modules from `torch.nn` * [TEST] Apply sub-sampling to input and target for shape check * [FIX] Add tear_down call * [DEL] Remove clone+detach Co-authored-by: Felix Dangel --- backpack/core/derivatives/crossentropyloss.py | 100 ++++++++++++------ backpack/core/derivatives/nll_base.py | 18 +--- backpack/utils/module_classification.py | 13 +-- test/core/derivatives/derivatives_test.py | 36 ++++++- test/core/derivatives/problem.py | 6 +- 5 files changed, 109 insertions(+), 64 deletions(-) diff --git a/backpack/core/derivatives/crossentropyloss.py b/backpack/core/derivatives/crossentropyloss.py index 68690df9a..5c8bc2768 100644 --- a/backpack/core/derivatives/crossentropyloss.py +++ b/backpack/core/derivatives/crossentropyloss.py @@ -3,21 +3,31 @@ from typing import Callable, Dict, List, Tuple from einops import rearrange -from torch import Tensor, diag, diag_embed, einsum, eye, multinomial, ones_like, softmax +from torch import Size, Tensor, diag, diag_embed, einsum, eye, ones_like, softmax +from torch.distributions import Categorical from torch.nn import CrossEntropyLoss from torch.nn.functional import one_hot -from backpack.core.derivatives.basederivatives import BaseLossDerivatives +from backpack.core.derivatives.nll_base import NLLLossDerivatives from backpack.utils.subsampling import subsample -class CrossEntropyLossDerivatives(BaseLossDerivatives): +class CrossEntropyLossDerivatives(NLLLossDerivatives): """Partial derivatives for cross-entropy loss. The `torch.nn.CrossEntropyLoss` operation is a composition of softmax and negative log-likelihood. 
""" + def __init__(self, use_autograd: bool = False): + """Initialization for CE loss derivative. + + Args: + use_autograd: Compute gradients with autograd (rather than manual) + Defaults to ``False`` (manual computation). + """ + super().__init__(use_autograd=use_autograd) + def _sqrt_hessian( self, module: CrossEntropyLoss, @@ -43,37 +53,6 @@ def _sqrt_hessian( sqrt_H = self._expand_sqrt_h(sqrt_H) return sqrt_H - def _sqrt_hessian_sampled( - self, - module: CrossEntropyLoss, - g_inp: Tuple[Tensor], - g_out: Tuple[Tensor], - mc_samples: int = 1, - subsampling: List[int] = None, - ) -> Tensor: - self._check_2nd_order_parameters(module) - - M = mc_samples - C = module.input0.shape[1] - - probs = self._get_probs(module, subsampling=subsampling) - probs, *rearrange_info = self._merge_batch_and_additional(probs) - - V_dim = 0 - probs_unsqueezed = probs.unsqueeze(V_dim).repeat(M, 1, 1) - - multi = multinomial(probs, M, replacement=True) - classes = one_hot(multi, num_classes=C) - classes = einsum("nvc->vnc", classes).float() - - sqrt_mc_h = (probs_unsqueezed - classes) / sqrt(M) - - if module.reduction == "mean": - sqrt_mc_h /= sqrt(self._get_mean_normalization(module.input0)) - - sqrt_mc_h = self._ungroup_batch_and_additional(sqrt_mc_h, *rearrange_info) - return sqrt_mc_h - def _sum_hessian( self, module: CrossEntropyLoss, g_inp: Tuple[Tensor], g_out: Tuple[Tensor] ) -> Tensor: @@ -265,3 +244,56 @@ def _get_mean_normalization(input: Tensor) -> int: Divisor for mean reduction. """ return input.numel() // input.shape[1] + + def _verify_support(self, module: CrossEntropyLoss): + """We only support default weight and ignore_index. + + Args: + module: CrossEntropyLoss module + """ + self._check_2nd_order_parameters(module) + + def _make_distribution(self, subsampled_input: Tensor) -> Categorical: + """Create the likelihood distribution whose NLL is the CE. + + The log probability of the Categorical distribution for a single sample + with k classes is ∑ᵢ₌₁ᵏ Ŷᵢ log pᵢ, where Ŷ is one-hot encoded. If p is + chosen as the softmax, this is equivalent to CrossEntropyLoss + + Args: + subsampled_input: input after subsampling + + Returns: + Normal distribution for targets | inputs + """ + probs = softmax(subsampled_input, dim=1) + probs_rearranged = einsum("nc...->n...c", probs) + return Categorical(probs_rearranged) + + def _compute_sampled_grads_manual( + self, subsampled_input: Tensor, mc_samples: int + ) -> Tensor: + """Manually compute gradients from sampled targets. + + Cross Entropy loss is ∑ᵢ₌₁ᵏ Ŷᵢ log 𝜎(xᵢ), where 𝜎(xᵢ) is the softmax of + the input and Ŷᵢ is one-hot encoded. The gradient is 𝜎(xᵢ) - Ŷᵢ. 
+ + Args: + subsampled_input: input after subsampling + mc_samples: number of samples + + Returns: + Gradient samples + """ + probs = softmax(subsampled_input, dim=1) + expand_dims = [mc_samples] + probs.dim() * [-1] + probs_unsqeezed = probs.unsqueeze(0).expand(*expand_dims) # [V N C D1 D2] + + distribution = self._make_distribution(subsampled_input) + samples = distribution.sample(Size([mc_samples])) # [V N D1 D2] + samples_onehot = one_hot(samples, num_classes=probs.shape[1]) # [V N D1 D2 C] + samples_onehot_rearranged = einsum("vn...c->vnc...", samples_onehot).to( + probs.dtype + ) # [V N C D1 D2] + + return probs_unsqeezed - samples_onehot_rearranged diff --git a/backpack/core/derivatives/nll_base.py b/backpack/core/derivatives/nll_base.py index 1f8b14795..f2c5dd607 100644 --- a/backpack/core/derivatives/nll_base.py +++ b/backpack/core/derivatives/nll_base.py @@ -115,12 +115,11 @@ def _compute_sampled_grads_autograd( gradients = [] dist = self._make_distribution(subsampled_input) - self._check_distribution_shape(dist, subsampled_input) for _ in range(mc_samples): y_tilde = dist.sample() loss_tilde = -dist.log_prob(y_tilde).sum() - gradients.append(grad(loss_tilde, subsampled_input)[0]) + gradients.append(grad(loss_tilde, subsampled_input, retain_graph=True)[0]) return stack(gradients) @@ -171,18 +170,3 @@ def _get_mean_normalization(input: Tensor) -> int: NotImplementedError: If not implemented """ raise NotImplementedError - - @staticmethod - def _check_distribution_shape(dist: Distribution, subsampled_input: Tensor): - """Verify shape of sampled targets y ∼ p(· | f). - - Args: - dist: Distribution of the likelihood p(y | f), e.g. created by - _make_distribution. - subsampled_input: Input after subsampling. - - Raises: - ValueError: If the target samples have incorrect shape. - """ - if dist.sample().shape != subsampled_input.shape: - raise ValueError("Sample does not have same shape as subsampled_input.") diff --git a/backpack/utils/module_classification.py b/backpack/utils/module_classification.py index ff3470990..e3321c9ff 100644 --- a/backpack/utils/module_classification.py +++ b/backpack/utils/module_classification.py @@ -1,9 +1,8 @@ """Contains util function for classification of modules.""" from torch.fx import GraphModule -from torch.nn import Module, Sequential +from torch.nn import CrossEntropyLoss, Module, MSELoss, Sequential from torch.nn.modules.loss import _Loss -from backpack.core.derivatives.mseloss import MSELoss from backpack.custom_module.branching import Parallel, _Branch from backpack.custom_module.reduce_tuple import ReduceTuple @@ -20,16 +19,18 @@ def is_loss(module: Module) -> bool: return isinstance(module, _Loss) -def is_mse(module: Module) -> bool: - """Return whether 'module' is a MSELoss function. +def is_nll(module: Module) -> bool: + """Return whether 'module' is an NLL loss function. + + Current NLL loss functions include MSE and CE. Args: module: A PyTorch module. 
Returns: - Whether 'module' is an MSE loss function + Whether 'module' is an NLL loss function """ - return isinstance(module, MSELoss) + return isinstance(module, (MSELoss, CrossEntropyLoss)) def is_no_op(module: Module) -> bool: diff --git a/test/core/derivatives/derivatives_test.py b/test/core/derivatives/derivatives_test.py index 8c2c0c2d6..fb39bc96c 100644 --- a/test/core/derivatives/derivatives_test.py +++ b/test/core/derivatives/derivatives_test.py @@ -7,7 +7,7 @@ - Transposed Jacobian-matrix products with respect to layer parameters """ from contextlib import nullcontext -from test.automated_test import check_sizes_and_values +from test.automated_test import check_sizes, check_sizes_and_values from test.core.derivatives.batch_norm_settings import BATCH_NORM_SETTINGS from test.core.derivatives.embedding_settings import EMBEDDING_SETTINGS from test.core.derivatives.implementation.autograd import AutogradDerivatives @@ -33,6 +33,7 @@ from torch import Tensor, rand from backpack.core.derivatives.convnd import weight_jac_t_save_memory +from backpack.utils.subsampling import subsample PROBLEMS = make_test_problems(SETTINGS) IDS = [problem.make_id() for problem in PROBLEMS] @@ -73,8 +74,8 @@ problem.make_id() for problem in CUSTOM_SLICING_MODULE_PROBLEMS ] -MSE_PROBLEMS = [problem for problem in PROBLEMS if problem.is_mse()] -MSE_IDS = [problem.make_id() for problem in MSE_PROBLEMS] +NLL_PROBLEMS = [problem for problem in PROBLEMS if problem.is_nll()] +NLL_IDS = [problem.make_id() for problem in NLL_PROBLEMS] SUBSAMPLINGS = [None, [0, 0], [2, 0]] SUBSAMPLING_IDS = [f"subsampling={s}".replace(" ", "") for s in SUBSAMPLINGS] @@ -348,7 +349,7 @@ def test_sqrt_hessian_sampled_squared_approximates_hessian( @mark.parametrize("subsampling", SUBSAMPLINGS, ids=SUBSAMPLING_IDS) -@mark.parametrize("problem", MSE_PROBLEMS, ids=MSE_IDS) +@mark.parametrize("problem", NLL_PROBLEMS, ids=NLL_IDS) def test_sqrt_hessian_sampled_squared_approximates_hessian_nll( problem: DerivativesTestProblem, subsampling: Union[List[int], None], @@ -405,6 +406,33 @@ def test_sqrt_hessian_sampled_squared_approximates_hessian_nll( raise e +@mark.parametrize("subsampling", SUBSAMPLINGS, ids=SUBSAMPLING_IDS) +@mark.parametrize("problem", NLL_PROBLEMS, ids=NLL_IDS) +def test_dist_sample_shape_nll( + problem: DerivativesTestProblem, + subsampling: Union[List[int], None], +) -> None: + """Test distribution sample shape for NLL derivatives. + + Compares the shape sampled from the distribution to the output to + verify the shapes match. + + Args: + problem: Test case. + subsampling: Indices of active samples. 
+ """ + problem.set_up() + skip_subsampling_conflict(problem, subsampling) + BackpackDerivatives(problem).store_forward_io() + + subsampled_input = subsample(problem.module.input0, subsampling=subsampling) + subsampled_target = subsample(problem.module.input1, subsampling=subsampling) + samples = problem.derivative._make_distribution(subsampled_input).sample() + + check_sizes(samples, subsampled_target) + problem.tear_down() + + @mark.parametrize("subsampling", SUBSAMPLINGS, ids=SUBSAMPLING_IDS) @mark.parametrize("problem", LOSS_FAIL_PROBLEMS, ids=LOSS_FAIL_IDS) def test_sqrt_hessian_sampled_should_fail( diff --git a/test/core/derivatives/problem.py b/test/core/derivatives/problem.py index 4d98ed533..cd46704bc 100644 --- a/test/core/derivatives/problem.py +++ b/test/core/derivatives/problem.py @@ -8,7 +8,7 @@ from torch import Tensor, long from backpack import extend -from backpack.utils.module_classification import is_loss, is_mse +from backpack.utils.module_classification import is_loss, is_nll from backpack.utils.subsampling import subsample @@ -141,8 +141,8 @@ def make_output_shape(self): def is_loss(self): return is_loss(self.make_module()) - def is_mse(self): - return is_mse(self.make_module()) + def is_nll(self): + return is_nll(self.make_module()) def forward_pass( self, input_requires_grad: bool = False, subsampling: List[int] = None From cd864721c5ecb10f0a774397154eadadddbc0d81 Mon Sep 17 00:00:00 2001 From: jabader97 <51213988+jabader97@users.noreply.github.com> Date: Fri, 3 Jun 2022 15:22:54 +0200 Subject: [PATCH 03/29] [ADD] Implement sampled gradients for `BCEWithLogitsLoss` (#257) * [ADD] Added BCEWithLogits loss to NLL base * [TEST] Skip BCEWithLogitsLoss _sqrt_hessian and _compute_sampled_grads_manual (not implemented) * [DOC] Fix darglint * [DEL] Remove f-string * [TEST] Skip unimplemented methods for BCEWithLogitsLoss * [REF] Raise NotImplementedErrors, rename output -> target * [REF] Rename bceloss -> bcewithlogitsloss * [REF] Less imports, type annotation, docstring polish * [ADD] Support `reduction='sum'` * [DEL] Remove redundant constructor Co-authored-by: Felix Dangel --- .../core/derivatives/bcewithlogitsloss.py | 84 +++++++++++++++++++ backpack/utils/module_classification.py | 6 +- test/core/derivatives/__init__.py | 3 + test/core/derivatives/derivatives_test.py | 10 +++ test/core/derivatives/loss_settings.py | 10 +++ test/utils/skip_test.py | 12 ++- 6 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 backpack/core/derivatives/bcewithlogitsloss.py diff --git a/backpack/core/derivatives/bcewithlogitsloss.py b/backpack/core/derivatives/bcewithlogitsloss.py new file mode 100644 index 000000000..15c4157d2 --- /dev/null +++ b/backpack/core/derivatives/bcewithlogitsloss.py @@ -0,0 +1,84 @@ +"""NLL extention for BCEWithLogits Loss.""" + +from torch import Tensor +from torch.distributions import Binomial +from torch.nn import BCEWithLogitsLoss + +from backpack.core.derivatives.nll_base import NLLLossDerivatives + + +class BCELossWithLogitsDerivatives(NLLLossDerivatives): + """Derivatives of the BCEWithLogits Loss.""" + + def _verify_support(self, module: BCEWithLogitsLoss): + """Verification of module support for BCEWithLogitsLoss. + + Currently BCEWithLogitsLoss only supports binary target tensors, + 2D inputs, and default parameters. 
+ + Args: + module: BCEWithLogitsLoss module + """ + self._check_binary(module) + self._check_is_default(module) + self._check_input_dims(module) + + def _check_binary(self, module: BCEWithLogitsLoss): + """Raises exception if labels are not binary. + + Args: + module: BCEWithLogitsLoss module + + Raises: + NotImplementedError: if labels are non-binary. + """ + if any(x != 0 and x != 1 for x in module.input1.flatten()): + raise NotImplementedError( + "Only binary targets (0 and 1) are currently supported." + ) + + def _check_is_default(self, module: BCEWithLogitsLoss): + """Raises exception if module parameters are not default. + + Args: + module: BCEWithLogitsLoss module + + Raises: + NotImplementedError: if module parameters non-default. + """ + if module.weight is not None: + raise NotImplementedError("Only None weight is currently supported.") + if module.pos_weight is not None: + raise NotImplementedError("Only None pos_weight is currently supported.") + + def _check_input_dims(self, module: BCEWithLogitsLoss): + """Raises an exception if the shapes of the input are not supported. + + Args: + module: BCEWithLogitsLoss module + + Raises: + NotImplementedError: if input is not 2D. + """ + if module.input0.dim() != 2: + raise NotImplementedError("Only 2D inputs are currently supported.") + + def _make_distribution(self, subsampled_input: Tensor) -> Binomial: + """Make the sampling distribution for the NLL loss form of BCEWithLogits. + + The BCEWithLogitsLoss ∝ ∑ᵢ₌₁ⁿ Yᵢ log 𝜎(xᵢ) + (1 − Yᵢ) log(1− 𝜎(xᵢ)). + The log likelihood of the Binomial distribution is + Yᵢ log p(xᵢ) + (1 − Yᵢ) log(1 − p(xᵢ)), so these are equivalent if + p(xᵢ) = 𝜎(xᵢ). + + Args: + subsampled_input: input after subsampling + + Returns: + Binomial distribution with sigmoid probabilities from the subsampled_input. + """ + return Binomial(probs=subsampled_input.sigmoid()) + + @staticmethod + def _get_mean_normalization(input: Tensor) -> int: + return input.shape[0] diff --git a/backpack/utils/module_classification.py b/backpack/utils/module_classification.py index e3321c9ff..c70e5247d 100644 --- a/backpack/utils/module_classification.py +++ b/backpack/utils/module_classification.py @@ -1,6 +1,6 @@ """Contains util function for classification of modules.""" from torch.fx import GraphModule -from torch.nn import CrossEntropyLoss, Module, MSELoss, Sequential +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, Module, MSELoss, Sequential from torch.nn.modules.loss import _Loss from backpack.custom_module.branching import Parallel, _Branch @@ -22,7 +22,7 @@ def is_loss(module: Module) -> bool: def is_nll(module: Module) -> bool: """Return whether 'module' is an NLL loss function. - Current NLL loss functions include MSE and CE. + Current NLL loss functions include MSE, CE and BCEWithLogits. Args: module: A PyTorch module. 
@@ -30,7 +30,7 @@ def is_nll(module: Module) -> bool: Returns: Whether 'module' is an NLL loss function """ - return isinstance(module, (MSELoss, CrossEntropyLoss)) + return isinstance(module, (MSELoss, CrossEntropyLoss, BCEWithLogitsLoss)) def is_no_op(module: Module) -> bool: diff --git a/test/core/derivatives/__init__.py b/test/core/derivatives/__init__.py index b76a1e633..36bc03bfc 100644 --- a/test/core/derivatives/__init__.py +++ b/test/core/derivatives/__init__.py @@ -13,6 +13,7 @@ BatchNorm1d, BatchNorm2d, BatchNorm3d, + BCEWithLogitsLoss, Conv1d, Conv2d, Conv3d, @@ -45,6 +46,7 @@ from backpack.core.derivatives.avgpool2d import AvgPool2DDerivatives from backpack.core.derivatives.avgpool3d import AvgPool3DDerivatives from backpack.core.derivatives.batchnorm_nd import BatchNormNdDerivatives +from backpack.core.derivatives.bcewithlogitsloss import BCELossWithLogitsDerivatives from backpack.core.derivatives.conv1d import Conv1DDerivatives from backpack.core.derivatives.conv2d import Conv2DDerivatives from backpack.core.derivatives.conv3d import Conv3DDerivatives @@ -120,4 +122,5 @@ SumModule: SumModuleDerivatives, Pad: PadDerivatives, Slicing: SlicingDerivatives, + BCEWithLogitsLoss: BCELossWithLogitsDerivatives, } diff --git a/test/core/derivatives/derivatives_test.py b/test/core/derivatives/derivatives_test.py index fb39bc96c..79664b173 100644 --- a/test/core/derivatives/derivatives_test.py +++ b/test/core/derivatives/derivatives_test.py @@ -24,6 +24,7 @@ from test.utils.skip_test import ( skip_adaptive_avg_pool3d_cuda, skip_batch_norm_train_mode_with_subsampling, + skip_BCEWithLogitsLoss, skip_subsampling_conflict, ) from typing import List, Union @@ -292,6 +293,7 @@ def test_sqrt_hessian_squared_equals_hessian( """ problem.set_up() skip_subsampling_conflict(problem, subsampling) + skip_BCEWithLogitsLoss(problem) # TODO Implement sqrt_hessian for BCEWithLogitsLoss backpack_res = BackpackDerivatives(problem).input_hessian_via_sqrt_hessian( subsampling=subsampling @@ -337,6 +339,9 @@ def test_sqrt_hessian_sampled_squared_approximates_hessian( """ problem.set_up() skip_subsampling_conflict(problem, subsampling) + skip_BCEWithLogitsLoss( + problem + ) # TODO Implement _compute_sampled_grads_manual for BCEWithLogitsLoss backpack_res = BackpackDerivatives(problem).input_hessian_via_sqrt_hessian( mc_samples=mc_samples, chunks=chunks, subsampling=subsampling @@ -456,6 +461,7 @@ def test_sum_hessian(problem): problem (DerivativesProblem): Problem for derivative test. 
""" problem.set_up() + skip_BCEWithLogitsLoss(problem) # TODO Implement _sum_hessian for BCEWithLogitsLoss backpack_res = BackpackDerivatives(problem).sum_hessian() autograd_res = AutogradDerivatives(problem).sum_hessian() @@ -596,6 +602,10 @@ def test_make_hessian_mat_prod(problem: DerivativesTestProblem) -> None: problem: test problem """ problem.set_up() + skip_BCEWithLogitsLoss( + problem + ) # TODO Implement _make_hessian_mat_prod for BCEWithLogitsLoss + mat = rand(4, *problem.input_shape, device=problem.device) autograd_res = AutogradDerivatives(problem).hessian_mat_prod(mat) diff --git a/test/core/derivatives/loss_settings.py b/test/core/derivatives/loss_settings.py index 391420cae..ad292e48c 100644 --- a/test/core/derivatives/loss_settings.py +++ b/test/core/derivatives/loss_settings.py @@ -77,6 +77,16 @@ "input_fn": lambda: torch.rand(size=(1, 1)), "target_fn": lambda: regression_targets(size=(1, 1)), }, + { + "module_fn": lambda: torch.nn.BCEWithLogitsLoss(reduction="mean"), + "input_fn": lambda: torch.rand(size=(2, 1)), + "target_fn": lambda: classification_targets(size=(2, 1), num_classes=2).float(), + }, + { + "module_fn": lambda: torch.nn.BCEWithLogitsLoss(reduction="sum"), + "input_fn": lambda: torch.rand(size=(4, 1)), + "target_fn": lambda: classification_targets(size=(4, 1), num_classes=2).float(), + }, ] diff --git a/test/utils/skip_test.py b/test/utils/skip_test.py index 4f282662f..0a77a5c1a 100644 --- a/test/utils/skip_test.py +++ b/test/utils/skip_test.py @@ -5,7 +5,7 @@ from typing import List, Union from pytest import skip -from torch.nn import BatchNorm1d, BatchNorm2d, BatchNorm3d +from torch.nn import BatchNorm1d, BatchNorm2d, BatchNorm3d, BCEWithLogitsLoss from backpack.utils import ADAPTIVE_AVG_POOL_BUG @@ -69,3 +69,13 @@ def skip_large_parameters( num_params = sum(p.numel() for p in problem.trainable_parameters()) if num_params > max_num_params: skip(f"Model has too many parameters: {num_params} > {max_num_params}") + + +def skip_BCEWithLogitsLoss(problem: ExtensionsTestProblem) -> None: + """Skip if the test problem uses BCEWithLogitsLoss. + + Args: + problem: Test case. + """ + if isinstance(problem.module, BCEWithLogitsLoss): + skip("Skipping BCEWithLogitsLoss") From 94f2bdef899188900fe03faa05dbc8324c00230d Mon Sep 17 00:00:00 2001 From: Felix Dangel <48687646+f-dangel@users.noreply.github.com> Date: Sat, 24 Sep 2022 17:52:31 +0200 Subject: [PATCH 04/29] [REF] Ignore warning to explicitly declare `abstractmethod`s (#264) * [REF] Declare `abstractmethod`s * [REF] Ignore warning: class inheriting from `ABC` has no abstract methods Co-authored-by: Felix Dangel --- backpack/core/derivatives/basederivatives.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backpack/core/derivatives/basederivatives.py b/backpack/core/derivatives/basederivatives.py index 94c152884..2a55ac88d 100644 --- a/backpack/core/derivatives/basederivatives.py +++ b/backpack/core/derivatives/basederivatives.py @@ -9,7 +9,7 @@ from backpack.core.derivatives import shape_check -class BaseDerivatives(ABC): +class BaseDerivatives(ABC): # noqa: B024 """First- and second-order partial derivatives of unparameterized module. 
Note: @@ -193,6 +193,7 @@ def hessian_is_diagonal(self, module: Module) -> bool: # FIXME Currently returns `∂²output[i] / ∂input[i]² * g_out[0][i]`, # which s the residual matrix diagonal, rather than the Hessian diagonal + def hessian_diagonal( self, module: Module, g_in: Tuple[Tensor], g_out: Tuple[Tensor] ) -> Tensor: @@ -306,7 +307,7 @@ def reshape_like_output(cls, mat: Tensor, module: Module) -> Tensor: return cls._reshape_like(mat, module.output.shape) -class BaseParameterDerivatives(BaseDerivatives, ABC): +class BaseParameterDerivatives(BaseDerivatives, ABC): # noqa: B024 """First- and second order partial derivatives of a module with parameters. Assumptions (true for `nn.Linear`, `nn.Conv(Transpose)Nd`, `nn.BatchNormNd`): @@ -435,7 +436,7 @@ def _weight_jac_mat_prod( raise NotImplementedError -class BaseLossDerivatives(BaseDerivatives, ABC): +class BaseLossDerivatives(BaseDerivatives, ABC): # noqa: B024 """Second- order partial derivatives of loss functions.""" # TODO Add shape check From 53ddd861da47df30c664e3b58dbdf19a598a3f05 Mon Sep 17 00:00:00 2001 From: Felix Dangel <48687646+f-dangel@users.noreply.github.com> Date: Sat, 24 Sep 2022 18:06:08 +0200 Subject: [PATCH 05/29] [FIX] Use correct imports from backpack in website examples (#262) Replaces `from backpack import ..., X` <-> `from backpack import ..., extensions` and `with backpack(X())` <-> `with backpack(extensions.X())` Also applies white space cleanup. Co-authored-by: Felix Dangel --- docs_src/splash/_includes/code-samples.html | 34 ++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs_src/splash/_includes/code-samples.html b/docs_src/splash/_includes/code-samples.html index e5a98ba50..7a701bce8 100644 --- a/docs_src/splash/_includes/code-samples.html +++ b/docs_src/splash/_includes/code-samples.html @@ -71,14 +71,14 @@ """ from torch.nn import CrossEntropyLoss, Linear from utils import load_mnist_data -from backpack import extend, backpack, Variance +from backpack import extend, backpack, extensions X, y = load_mnist_data() model = extend(Linear(784, 10)) lossfunc = extend(CrossEntropyLoss()) loss = lossfunc(model(X), y) -with backpack(Variance()): +with backpack(extensions.Variance()): loss.backward() for param in model.parameters(): @@ -118,14 +118,14 @@ """ from torch.nn import CrossEntropyLoss, Linear from utils import load_mnist_data -from backpack import extend, backpack, DiagGGNExact +from backpack import extend, backpack, extensions X, y = load_mnist_data() model = extend(Linear(784, 10)) lossfunc = extend(CrossEntropyLoss()) loss = lossfunc(model(X), y) -with backpack(DiagGGNExact()): +with backpack(extensions.DiagGGNExact()): loss.backward() for param in model.parameters(): @@ -141,14 +141,14 @@ """ from torch.nn import CrossEntropyLoss, Linear from utils import load_mnist_data -from backpack import extend, backpack, KFAC +from backpack import extend, backpack, extensions X, y = load_mnist_data() model = extend(Linear(784, 10)) lossfunc = extend(CrossEntropyLoss()) loss = lossfunc(model(X), y) -with backpack(KFAC()): +with backpack(extensions.KFAC()): loss.backward() for param in model.parameters(): @@ -159,20 +159,20 @@
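
For reference, the MC approximation that `NLLLossDerivatives` introduces in this
patch series can be reproduced with a short, self-contained PyTorch sketch. The
snippet below mirrors the autograd path (`_compute_sampled_grads_autograd`) for the
`MSELoss` case with `reduction="sum"`; the variable names (`f`, `y_tilde`, `N`, `D`,
`mc_samples`) and the sample count are illustrative placeholders. Targets are drawn
from the Gaussian likelihood with mean `f` and standard deviation √0.5, gradients of
the negative log-likelihood are stacked, and dividing by √mc_samples makes the outer
products of the stacked gradients approximate the loss Hessian.

# Minimal sketch of the MC-sampled Hessian square root for the MSELoss case
# (reduction="sum"): draw targets from the likelihood N(f, 0.5), differentiate
# the negative log-likelihood w.r.t. f, and stack the gradients.
from math import sqrt

from torch import einsum, rand, stack
from torch.autograd import grad
from torch.distributions import Normal

N, D, mc_samples = 8, 3, 1000           # batch size, feature dimension, MC samples
f = rand(N, D, requires_grad=True)      # module input (the network output)

dist = Normal(f, sqrt(0.5))             # likelihood whose NLL is the sum-reduced MSE
grads = []
for _ in range(mc_samples):
    y_tilde = dist.sample()                          # sampled targets
    nll = -dist.log_prob(y_tilde).sum()              # proportional to MSE(f, y_tilde)
    grads.append(grad(nll, f, retain_graph=True)[0])

sqrt_hessian = stack(grads) / sqrt(mc_samples)       # shape [mc_samples, N, D]
hessian_mc = einsum("vnd,vne->nde", sqrt_hessian, sqrt_hessian)

# For reduction="sum", the exact per-sample Hessian of the MSE is 2 * identity,
# so the diagonal of hessian_mc should be close to 2 and off-diagonals close to 0.
print(hessian_mc.mean(dim=0))

For `reduction="mean"`, the same stacked gradients would additionally be divided by
the square root of the normalization factor, matching `_get_mean_normalization`
(`input.numel()` in the `MSELoss` case) used by `_sqrt_hessian_sampled` above.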