
Commit 4869274

Refactor sum layers + consistency checks + bug fixes (#314)
* merged dense and mixing layer into sum layer over the whole lib
* refactor parameterization of sum layers
* re-run notebooks
* cleaning
* add MixingWeightInitializer as a way to symbolically initialize sum layers with arity > 1
* add sanity checks to the symbolic representation
* updated torch embedding layer impl
* minor fix when fold=False
* re-run notebooks
* add a few more symbolic checks
* fixed #319 NaNs with logic circuits
* moved from mixing weights initializer to mixing weights parameterizer
* minor fixes
* rerun some notebooks
* add docstrings about the mixing weight factory
* cleaning imports
* updated region-graphs-and-parametrisation.ipynb
1 parent 1ea1983 commit 4869274

33 files changed, +771 -748 lines

cirkit/backend/torch/compiler.py

+9-3
@@ -21,7 +21,7 @@
     match_optimization_patterns,
     optimize_graph,
 )
-from cirkit.backend.torch.initializers import stacked_initializer_
+from cirkit.backend.torch.initializers import foldwise_initializer_
 from cirkit.backend.torch.layers import TorchInputLayer, TorchLayer
 from cirkit.backend.torch.layers.input import TorchConstantLayer
 from cirkit.backend.torch.optimization.layers import (
@@ -370,7 +370,7 @@ def _fold_parameter_nodes_group(
         num_folds=len(group),
         requires_grad=group[0].requires_grad,
         initializer_=functools.partial(
-            stacked_initializer_, initializers=list(map(lambda p: p.initializer, group))
+            foldwise_initializer_, initializers=list(map(lambda p: p.initializer, group))
         ),
         dtype=group[0].dtype,
     )
@@ -548,6 +548,7 @@ def _match_layer_pattern(
     outcomings_fn: Callable[[TorchLayer], Sequence[TorchLayer]],
 ) -> LayerOptMatch | None:
     ppatterns = pattern.ppatterns()
+    cpatterns = pattern.cpatterns()
     pattern_entries = pattern.entries()
     num_entries = len(pattern_entries)
     matched_layers = []
@@ -566,7 +567,12 @@
         if len(out_nodes) > 1 and lid != 0:
             return None
 
-        # Second, attempt to match the patterns specified for its parameters
+        # Second, attempt to match the configuration patterns for the layer
+        for cname, cvalue in cpatterns[lid].items():
+            if layer.config[cname] != cvalue:
+                return None
+
+        # Third, attempt to match the patterns specified for its parameters
         lpmatches = {}
        for pname, ppattern in ppatterns[lid].items():
             pgraph = layer.params[pname]
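The added configuration check rejects a candidate layer as soon as one of the config entries required by the optimization pattern disagrees with the layer's actual config. A minimal standalone sketch of that idea, using plain dicts as stand-ins for the actual cirkit pattern and layer objects (names here are illustrative, not library API):

```python
# Hypothetical, simplified illustration of the configuration-pattern check:
# a layer only matches a pattern if every config entry the pattern constrains
# (e.g. a specific arity) equals the layer's actual config value.

def config_matches(layer_config: dict, cpattern: dict) -> bool:
    """Return True if the layer config satisfies all config constraints of the pattern."""
    return all(layer_config.get(name) == value for name, value in cpattern.items())

# Example: a pattern that only applies to sum layers of arity 1.
layer_config = {"num_input_units": 16, "num_output_units": 8, "arity": 1}
assert config_matches(layer_config, {"arity": 1})
assert not config_matches(layer_config, {"arity": 2})
```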

cirkit/backend/torch/initializers.py

+10-10
@@ -7,7 +7,14 @@
 InitializerFunc = Callable[[Tensor], Tensor]
 
 
-def copy_from_ndarray_(tensor: torch.Tensor, *, array: np.ndarray) -> Tensor:
+def foldwise_initializer_(t: Tensor, *, initializers: list[InitializerFunc | None]) -> Tensor:
+    for i, initializer_ in enumerate(initializers):
+        if initializer_ is not None:
+            initializer_(t[i])
+    return t
+
+
+def copy_from_ndarray_(tensor: Tensor, *, array: np.ndarray) -> Tensor:
     t = torch.from_numpy(array)
     default_float_dtype = torch.get_default_dtype()
     if t.is_floating_point():
@@ -21,7 +28,7 @@ def copy_from_ndarray_(tensor: torch.Tensor, *, array: np.ndarray) -> Tensor:
     return tensor.copy_(t)
 
 
-def dirichlet_(tensor: torch.Tensor, alpha: float | list[float], *, dim: int = -1) -> Tensor:
+def dirichlet_(tensor: Tensor, alpha: float | list[float], *, dim: int = -1) -> Tensor:
     shape = tensor.shape
     if len(shape) == 0:
         raise ValueError(
@@ -35,15 +42,8 @@ def dirichlet_(tensor: torch.Tensor, alpha: float | list[float], *, dim: int = -1) -> Tensor:
         raise ValueError(
             "The selected dim of the tensor and the size of concentration parameters do not match"
         )
-    concentration = torch.tensor(alpha)
+    concentration = Tensor(alpha)
     dirichlet = torch.distributions.Dirichlet(concentration)
     samples = dirichlet.sample(torch.Size([d for i, d in enumerate(shape) if i != dim]))
     tensor.copy_(torch.transpose(samples, dim, -1))
     return tensor
-
-
-def stacked_initializer_(t: Tensor, *, initializers: list[InitializerFunc | None]) -> Tensor:
-    for i, initializer_ in enumerate(initializers):
-        if initializer_ is not None:
-            initializer_(t[i])
-    return t
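The renamed `foldwise_initializer_` applies one initializer per fold of a stacked parameter tensor, skipping folds whose initializer is `None`. A minimal usage sketch, built only from the function body shown above; the per-fold initializers are standard `torch.nn.init` calls chosen for illustration:

```python
import functools

import torch

from cirkit.backend.torch.initializers import foldwise_initializer_

# A folded parameter tensor with 3 folds, each of shape (2, 4).
t = torch.empty(3, 2, 4)

# One initializer per fold; None leaves that fold untouched.
initializers = [
    torch.nn.init.zeros_,
    None,
    functools.partial(torch.nn.init.constant_, val=1.0),
]

foldwise_initializer_(t, initializers=initializers)
assert torch.all(t[0] == 0.0) and torch.all(t[2] == 1.0)
```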

cirkit/backend/torch/layers/__init__.py

-3
@@ -1,10 +1,7 @@
 from .base import TorchLayer as TorchLayer
-from .inner import TorchDenseLayer as TorchDenseLayer
 from .inner import TorchHadamardLayer as TorchHadamardLayer
 from .inner import TorchInnerLayer as TorchInnerLayer
 from .inner import TorchKroneckerLayer as TorchKroneckerLayer
-from .inner import TorchMixingLayer as TorchMixingLayer
-from .inner import TorchProductLayer as TorchProductLayer
 from .inner import TorchSumLayer as TorchSumLayer
 from .input import TorchCategoricalLayer as TorchCategoricalLayer
 from .input import TorchConstantValueLayer as TorchLogPartitionLayer
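For downstream code, the practical effect of the dropped exports is an import migration; a sketch only:

```python
# Before this commit (no longer exported):
# from cirkit.backend.torch.layers import TorchDenseLayer, TorchMixingLayer

# After this commit: both dense and mixing sums are instances of the unified layer.
from cirkit.backend.torch.layers import TorchSumLayer
```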

cirkit/backend/torch/layers/inner.py

+23-108
@@ -46,15 +46,7 @@ def sample(self, x: Tensor) -> tuple[Tensor, Tensor | None]:
         raise TypeError(f"Sampling not implemented for {type(self)}")
 
 
-class TorchProductLayer(TorchInnerLayer, ABC):
-    ...
-
-
-class TorchSumLayer(TorchInnerLayer, ABC):
-    ...
-
-
-class TorchHadamardLayer(TorchProductLayer):
+class TorchHadamardLayer(TorchInnerLayer):
     """The Hadamard product layer."""
 
     def __init__(
@@ -110,7 +102,7 @@ def sample(self, x: Tensor) -> tuple[Tensor, None]:
         return x, None
 
 
-class TorchKroneckerLayer(TorchProductLayer):
+class TorchKroneckerLayer(TorchInnerLayer):
     """The Kronecker product layer."""
 
     def __init__(
@@ -171,13 +163,14 @@ def sample(self, x: Tensor) -> tuple[Tensor, Tensor | None]:
         return torch.flatten(x, start_dim=2, end_dim=3), None
 
 
-class TorchDenseLayer(TorchSumLayer):
-    """The sum layer for dense sum within a layer."""
+class TorchSumLayer(TorchInnerLayer):
+    """The sum layer."""
 
     def __init__(
         self,
         num_input_units: int,
         num_output_units: int,
+        arity: int = 1,
         *,
         weight: TorchParameter,
         semiring: Semiring | None = None,
@@ -192,91 +185,7 @@ def __init__(
             num_folds (int): The number of channels. Defaults to 1.
         """
         assert weight.num_folds == num_folds
-        assert weight.shape == (num_output_units, num_input_units)
-        super().__init__(
-            num_input_units, num_output_units, arity=1, semiring=semiring, num_folds=num_folds
-        )
-        self.weight = weight
-
-    @property
-    def config(self) -> Mapping[str, Any]:
-        return {"num_input_units": self.num_input_units, "num_output_units": self.num_output_units}
-
-    @property
-    def params(self) -> Mapping[str, TorchParameter]:
-        return {"weight": self.weight}
-
-    def forward(self, x: Tensor) -> Tensor:
-        """Run forward pass.
-
-        Args:
-            x (Tensor): The input to this layer, shape (F, H, B, Ki).
-
-        Returns:
-            Tensor: The output of this layer, shape (F, B, Ko).
-        """
-        x = x.squeeze(dim=1)  # shape (F, H=1, B, Ki) -> (F, B, Ki).
-        weight = self.weight()
-        return self.semiring.einsum(
-            "fbi,foi->fbo", inputs=(x,), operands=(weight,), dim=-1, keepdim=True
-        )  # shape (F, B, Ko).
-
-    def sample(self, x: Tensor) -> tuple[Tensor, Tensor]:
-        weight = self.weight()
-        negative = torch.any(weight < 0.0)
-        if negative:
-            raise ValueError("Sampling only works with positive weights")
-        normalized = torch.allclose(torch.sum(weight, dim=-1), torch.ones(1, device=weight.device))
-        if not normalized:
-            raise ValueError("Sampling only works with a normalized parametrization")
-
-        # x: (F, H, C, K, num_samples, D)
-        c = x.shape[2]
-        d = x.shape[-1]
-        num_samples = x.shape[-2]
-
-        # mixing_distribution: (F, O, K)
-        mixing_distribution = torch.distributions.Categorical(probs=weight)
-
-        mixing_samples = mixing_distribution.sample((num_samples,))
-        mixing_samples = E.rearrange(mixing_samples, "n f o -> f o n")
-        mixing_indices = E.repeat(mixing_samples, "f o n -> f a c o n d", a=self.arity, c=c, d=d)
-
-        x = torch.gather(x, dim=-3, index=mixing_indices)
-        x = x[:, 0]
-        return x, mixing_samples
-
-
-class TorchMixingLayer(TorchSumLayer):
-    """The sum layer for mixture among layers.
-
-    It can also be used as a sparse sum within a layer when arity=1.
-    """
-
-    def __init__(
-        self,
-        num_input_units: int,
-        num_output_units: int,
-        arity: int = 2,
-        *,
-        weight: TorchParameter,
-        semiring: Semiring | None = None,
-        num_folds: int = 1,
-    ) -> None:
-        """Init class.
-
-        Args:
-            num_input_units (int): The number of input units.
-            num_output_units (int): The number of output units, must be the same as input.
-            arity (int, optional): The arity of the layer. Defaults to 2.
-            weight (TorchParameter): The reparameterization for layer parameters.
-            num_folds (int): The number of channels. Defaults to 1.
-        """
-        assert (
-            num_output_units == num_input_units
-        ), "The number of input and output units must be the same for MixingLayer."
-        assert weight.num_folds == num_folds
-        assert weight.shape == (num_output_units, arity)
+        assert weight.shape == (num_output_units, arity * num_input_units)
         super().__init__(
             num_input_units, num_output_units, arity=arity, semiring=semiring, num_folds=num_folds
         )
@@ -303,11 +212,13 @@ def forward(self, x: Tensor) -> Tensor:
         Returns:
             Tensor: The output of this layer, shape (F, B, Ko).
         """
-        # shape (F, H, B, K) -> (F, B, K).
+        # x: (F, H, B, Ki) -> (F, B, H * Ki)
+        # weight: (F, Ko, H * Ki)
+        x = x.permute(0, 2, 1, 3).flatten(start_dim=2)
         weight = self.weight()
         return self.semiring.einsum(
-            "fhbk,fkh->fbk", inputs=(x,), operands=(weight,), dim=1, keepdim=False
-        )
+            "fbi,foi->fbo", inputs=(x,), operands=(weight,), dim=-1, keepdim=True
+        )  # shape (F, B, Ko).
 
     def sample(self, x: Tensor) -> tuple[Tensor, Tensor]:
         weight = self.weight()
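The shape bookkeeping behind the unified forward pass can be checked in isolation with a plain `torch.einsum`. This sketch ignores cirkit's semiring machinery and uses ordinary sum-product semantics; all sizes are made up for illustration:

```python
import torch

F_, H, B, Ki, Ko = 2, 3, 5, 4, 6      # folds, arity, batch, input units, output units
x = torch.randn(F_, H, B, Ki)         # layer input, shape (F, H, B, Ki)
weight = torch.rand(F_, Ko, H * Ki)   # folded weights, shape (F, Ko, H * Ki)

# (F, H, B, Ki) -> (F, B, H * Ki): fold the arity dimension into the unit dimension,
# so a dense sum (H=1) and a mixing sum (H>1) become the same contraction.
x_flat = x.permute(0, 2, 1, 3).flatten(start_dim=2)

# The same contraction the layer performs, here over the ordinary sum-product semiring.
y = torch.einsum("fbi,foi->fbo", x_flat, weight)
assert y.shape == (F_, B, Ko)
```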
@@ -318,18 +229,22 @@ def sample(self, x: Tensor) -> tuple[Tensor, Tensor]:
         if not normalized:
             raise ValueError("Sampling only works with a normalized parametrization")
 
-        # x: (F, H, C, K, num_samples, D)
-        c = x.shape[2]
-        k = x.shape[-3]
-        d = x.shape[-1]
-        num_samples = x.shape[-2]
+        # x: (F, H, C, Ki, num_samples, D) -> (F, C, H * Ki, num_samples, D)
+        x = x.permute(0, 2, 1, 3, 4, 5).flatten(2, 3)
+        c = x.shape[1]
+        num_samples = x.shape[3]
+        d = x.shape[4]
 
-        # mixing_distribution: (F, O, K)
+        # mixing_distribution: (F, Ko, H * Ki)
         mixing_distribution = torch.distributions.Categorical(probs=weight)
 
+        # mixing_samples: (num_samples, F, Ko) -> (F, Ko, num_samples)
         mixing_samples = mixing_distribution.sample((num_samples,))
         mixing_samples = E.rearrange(mixing_samples, "n f k -> f k n")
-        mixing_indices = E.repeat(mixing_samples, "f k n -> f 1 c k n d", c=c, k=k, d=d)
 
-        x = torch.gather(x, 1, mixing_indices)[:, 0]
+        # mixing_indices: (F, C, Ko, num_samples, D)
+        mixing_indices = E.repeat(mixing_samples, "f k n -> f c k n d", c=c, d=d)
+
+        # x: (F, C, Ko, num_samples, D)
+        x = torch.gather(x, dim=2, index=mixing_indices)
         return x, mixing_samples
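A shape-only sketch of the updated sampling path, again with plain torch and einops calls and made-up sizes; it mirrors the indexing logic shown in the diff, not the full layer:

```python
import einops as E
import torch

# folds, arity, channels, input/output units, samples, variables
F_, H, C, Ki, Ko, N, D = 2, 3, 1, 4, 6, 7, 5
weight = torch.softmax(torch.randn(F_, Ko, H * Ki), dim=-1)  # normalized mixing weights
x = torch.randint(0, 10, (F_, H, C, Ki, N, D))               # incoming samples

# (F, H, C, Ki, N, D) -> (F, C, H * Ki, N, D)
x = x.permute(0, 2, 1, 3, 4, 5).flatten(2, 3)

# One categorical index per (fold, output unit, sample): (N, F, Ko) -> (F, Ko, N).
mixing_samples = torch.distributions.Categorical(probs=weight).sample((N,))
mixing_samples = E.rearrange(mixing_samples, "n f k -> f k n")

# Broadcast the indices over channels and variables, then select along the unit axis.
mixing_indices = E.repeat(mixing_samples, "f k n -> f c k n d", c=C, d=D)
out = torch.gather(x, dim=2, index=mixing_indices)
assert out.shape == (F_, C, Ko, N, D)
```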

cirkit/backend/torch/layers/input.py

+13-6
@@ -166,12 +166,19 @@ def params(self) -> Mapping[str, TorchParameter]:
     def forward(self, x: Tensor) -> Tensor:
         if x.is_floating_point():
             x = x.long()  # The input to Embedding should be discrete
-        x = F.one_hot(x, self.num_states)  # (F, C, B, 1 num_states)
-        x = x.squeeze(dim=3)  # (F, C, B, num_states)
+        x = x.squeeze(dim=3)  # (F, C, B)
         weight = self.weight()
-        x = torch.einsum("fcbi,fkci->fbkc", x.to(weight.dtype), weight)
-        x = self.semiring.map_from(x, SumProductSemiring)
-        return self.semiring.prod(x, dim=-1)  # (F, B, K)
+        if self.num_channels == 1:
+            idx_fold = torch.arange(self.num_folds, device=weight.device)
+            x = weight[:, :, 0][idx_fold[:, None], :, x[:, 0]]
+            x = self.semiring.map_from(x, SumProductSemiring)
+        else:
+            idx_fold = torch.arange(self.num_folds, device=weight.device)[:, None, None]
+            idx_channel = torch.arange(self.num_channels, device=weight.device)[None, :, None]
+            x = weight[idx_fold, :, idx_channel, x]
+            x = self.semiring.map_from(x, SumProductSemiring)
+            x = self.semiring.prod(x, dim=1)
+        return x  # (F, B, K)
 
 
 class TorchExpFamilyLayer(TorchInputLayer, ABC):
@@ -332,7 +339,7 @@ def log_unnormalized_likelihood(self, x: Tensor) -> Tensor:
             x = logits[:, :, 0][idx_fold[:, None], :, x[:, 0]]
         else:
             idx_fold = torch.arange(self.num_folds, device=logits.device)[:, None, None]
-            idx_channel = torch.arange(self.num_channels)[None, :, None]
+            idx_channel = torch.arange(self.num_channels, device=logits.device)[None, :, None]
             x = torch.sum(logits[idx_fold, :, idx_channel, x], dim=1)
         return x
 
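The removed one-hot/einsum formulation and the new advanced-indexing lookup compute the same values; the lookup simply avoids materializing the one-hot tensor. A small standalone check under assumed shapes (embedding table of shape (F, K, C, num_states), integer inputs of shape (F, C, B, 1)), covering the single-channel branch only:

```python
import torch
import torch.nn.functional as F

F_, C, B, K, S = 2, 1, 5, 3, 7             # folds, channels, batch, units, num_states
weight = torch.randn(F_, K, C, S)          # embedding table, shape (F, K, C, num_states)
x = torch.randint(0, S, (F_, C, B, 1))     # discrete inputs, shape (F, C, B, 1)

# Old formulation: one-hot encode, then contract with an einsum.
x_oh = F.one_hot(x, S).squeeze(dim=3).float()                        # (F, C, B, num_states)
old = torch.einsum("fcbi,fkci->fbkc", x_oh, weight).squeeze(dim=-1)  # (F, B, K)

# New formulation (single-channel branch): a direct advanced-indexing lookup.
xs = x.squeeze(dim=3)                      # (F, C, B)
idx_fold = torch.arange(F_)
new = weight[:, :, 0][idx_fold[:, None], :, xs[:, 0]]                # (F, B, K)

assert torch.allclose(old, new)
```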
cirkit/backend/torch/layers/optimized.py

+4-9
@@ -1,21 +1,16 @@
-from abc import ABC
 from collections.abc import Mapping
 from typing import Any
 
 import einops as E
 import torch
 from torch import Tensor
 
-from cirkit.backend.torch.layers import TorchInnerLayer, TorchSumLayer
+from cirkit.backend.torch.layers import TorchInnerLayer
 from cirkit.backend.torch.parameters.parameter import TorchParameter
 from cirkit.backend.torch.semiring import Semiring
 
 
-class TorchSumProductLayer(TorchInnerLayer, ABC):
-    ...
-
-
-class TorchTuckerLayer(TorchSumProductLayer):
+class TorchTuckerLayer(TorchInnerLayer):
     """The Tucker (2) layer, which is a fused dense-kronecker.
 
     A ternary einsum is used to fuse the sum and product.
@@ -81,7 +76,7 @@ def forward(self, x: Tensor) -> Tensor:
         )
 
 
-class TorchCPTLayer(TorchSumProductLayer):
+class TorchCPTLayer(TorchInnerLayer):
     """The Candecomp Parafac (collapsed) layer, which is a fused dense-hadamard.
 
     The fusion actually does not gain anything, and is just a plain connection. We don't because \
@@ -173,7 +168,7 @@ def sample(self, x: Tensor) -> tuple[Tensor, Tensor]:
         return x, mixing_samples
 
 
-class TorchTensorDotLayer(TorchSumLayer):
+class TorchTensorDotLayer(TorchInnerLayer):
     """The sum layer for dense sum within a layer."""
 
     def __init__(
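Taken together with the changes in inner.py, the refactor flattens the inner-layer class hierarchy. A rough stub-only sketch of the resulting relationships, as visible in the diffs above (not actual library code):

```python
# The intermediate TorchProductLayer, abstract TorchSumLayer and
# TorchSumProductLayer bases are removed; every concrete inner layer now
# derives directly from TorchInnerLayer, and TorchSumLayer is the single
# concrete sum layer covering both dense (arity=1) and mixing (arity>1) sums.

class TorchInnerLayer: ...                       # base class for all inner layers

class TorchHadamardLayer(TorchInnerLayer): ...   # Hadamard product
class TorchKroneckerLayer(TorchInnerLayer): ...  # Kronecker product
class TorchSumLayer(TorchInnerLayer): ...        # unified dense/mixing sum
class TorchTuckerLayer(TorchInnerLayer): ...     # fused dense-kronecker
class TorchCPTLayer(TorchInnerLayer): ...        # fused dense-hadamard
class TorchTensorDotLayer(TorchInnerLayer): ...  # optimized dense sum
```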
