[Refactor] How attention is set in 3D UNet blocks #6893

Open · wants to merge 3 commits into main

8 changes: 4 additions & 4 deletions src/diffusers/models/unets/unet_3d_blocks.py
@@ -497,8 +497,8 @@ def __init__(
)
attentions.append(
Transformer2DModel(
-out_channels // num_attention_heads,
num_attention_heads,
+out_channels // num_attention_heads,
Collaborator

CrossAttnDownBlock3D is part of our public API and this is a breaking change, no?

@DN6 (Collaborator Author) Feb 7, 2024

Sorry, why isn't it backwards compatible? None of the args in the class init are being changed, right?

The 3D blocks and the 3D UNet are only used with the Text to Video Synth and I2VGenXL models in the library.

Collaborator Author

Same with get_up_block and get_down_block. We're only changing the number being passed to the `num_attention_heads` argument.

Collaborator

Changing the meaning of an argument is breaking, no?

For example, `CrossAttnDownBlock3D(..., num_attention_heads=64)` is currently expected to create attentions with head_dim=64; with this code change, it will create attentions with 64 heads instead.
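
A small sketch of that shift with made-up numbers (out_channels=1280, head count 64), relying on the same fact the diff does, namely that Transformer2DModel takes num_attention_heads and then attention_head_dim as its first two positional arguments:

```python
from diffusers.models import Transformer2DModel

out_channels, num_attention_heads = 1280, 64  # hypothetical block width / head count

# Old CrossAttnDownBlock3D behaviour: 1280 // 64 = 20 heads of dim 64,
# i.e. the block's num_attention_heads argument effectively acted as head_dim.
old_attn = Transformer2DModel(out_channels // num_attention_heads, num_attention_heads,
                              in_channels=out_channels)

# Behaviour after this PR: 64 heads of dim 1280 // 64 = 20,
# i.e. num_attention_heads now really is the number of heads.
new_attn = Transformer2DModel(num_attention_heads, out_channels // num_attention_heads,
                              in_channels=out_channels)
```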

@yiyixuxu (Collaborator) Feb 7, 2024

> The 3D blocks and the 3D UNet are only used with the Text to Video Synth and I2VGenXL models in the library.

Yes, but it is our public API and we have to assume it's been used outside of the library.

@DN6 (Collaborator Author) Feb 7, 2024

I think this might be a relatively safe change. Searching GitHub public repos for an import of these blocks from the public API doesn't return any results. I actually don't think you can import them directly from diffusers:

https://github.com/search?q=%22from+diffusers.models+import+CrossAttnDownBlock3D%22+language:Python+&type=code
https://github.com/search?q=%22from+diffusers+import+CrossAttnUpBlock3D%22+language:Python+&type=code
https://github.com/search?q=%22from+diffusers+import+UNetMidBlock3DCrossAttn%22+language:Python+&type=code

It looks like, more often than not, people redefine the blocks themselves:
https://github.com/search?q=%22CrossAttnDownBlock3D%22+language:Python+&type=code&p=5

@yiyixuxu (Collaborator) Feb 7, 2024

cc @pcuenca and @patrickvonplaten here.
I would like to hear your thoughts about when we can make breaking changes (other than v1.0.0).

Personally, I think we should only make breaking changes when we don't have another choice, or we know super confidently it is an edge case (e.g., if we had just added these blocks yesterday, I would think it's ok to break here).
I think in this case:
(1) we do not have to make these changes: we want to make them so our code is more readable and easier for contributors to contribute to, but it is not a must and this is not the only way to go;
(2) we don't really have a way to find out about its usage outside GitHub.

Also, I think a breaking change is somewhat more acceptable if we are able to throw an error. In this case, it would just break silently, so IMO it is worse.

But I'm curious about your thoughts on this and I'm cool with it if you all feel strongly about making this change here :)

@patrickvonplaten (Contributor) Feb 9, 2024

python -c "from diffusers import CrossAttnDownBlock3D"

doesn't work, so strictly speaking CrossAttnDownBlock3D is not considered part of the public API. Also, I don't think it's used that much, so IMO it's ok to change it here (while keeping in mind, though, that this might lead to breaking changes depending on how CrossAttnDownBlock3D is imported).
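
A quick way to see both sides of that caveat (a sketch; the submodule path follows this PR's file layout under diffusers/models/unets, which is an assumption about the installed version):

```python
# Top-level import fails, which is what makes the class "not public API":
try:
    from diffusers import CrossAttnDownBlock3D
except ImportError:
    print("CrossAttnDownBlock3D is not re-exported at the top level")

# ...but the class is still reachable through its submodule, and downstream code
# importing it this way would silently pick up the new argument semantics:
from diffusers.models.unets.unet_3d_blocks import CrossAttnDownBlock3D
```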

Comment on lines -500 to +501
Contributor

Let's use keyword arguments here when correcting it.
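
Roughly what that could look like inside the block's __init__, using the names from the surrounding diff (a sketch of the suggestion, not the committed code):

```python
attentions.append(
    Transformer2DModel(
        num_attention_heads=num_attention_heads,
        attention_head_dim=out_channels // num_attention_heads,
        in_channels=out_channels,
        num_layers=1,
        cross_attention_dim=cross_attention_dim,
    )
)
```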

in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
@@ -510,8 +510,8 @@ def __init__(
)
temp_attentions.append(
TransformerTemporalModel(
-out_channels // num_attention_heads,
num_attention_heads,
+out_channels // num_attention_heads,
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
@@ -731,8 +731,8 @@ def __init__(
)
attentions.append(
Transformer2DModel(
-out_channels // num_attention_heads,
num_attention_heads,
+out_channels // num_attention_heads,
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
@@ -744,8 +744,8 @@ def __init__(
)
temp_attentions.append(
TransformerTemporalModel(
-out_channels // num_attention_heads,
num_attention_heads,
+out_channels // num_attention_heads,
in_channels=out_channels,
num_layers=1,
cross_attention_dim=cross_attention_dim,
8 changes: 7 additions & 1 deletion src/diffusers/models/unets/unet_3d_condition.py
@@ -136,13 +136,19 @@ def __init__(
"At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
Contributor

I would prefer to try to remove this statement.

)

+if isinstance(attention_head_dim, int):
+    num_attention_heads = [out_channels // attention_head_dim for out_channels in block_out_channels]
@patrickvonplaten (Contributor) Feb 9, 2024

That's a good idea.

Are we sure though that this is always correct? Does out_channels always represent the hidden_dim of the attention layer?

Collaborator Author

For the 3D UNets this is safe. There are a limited number of blocks used with this model: `CrossAttnDownBlock3D`, `CrossAttnUpBlock3D`, and `UNetMidBlock3DCrossAttn`, and they all configure `num_attention_heads` based on the `out_channels`.

+else:
+    num_attention_heads = [
+        out_channels // attn_dim for out_channels, attn_dim in zip(block_out_channels, attention_head_dim)
+    ]
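
A worked example of the conversion added above, using hypothetical values in the style of a text-to-video UNet config (attention_head_dim=64, block_out_channels=(320, 640, 1280, 1280)):

```python
block_out_channels = (320, 640, 1280, 1280)  # hypothetical example values
attention_head_dim = 64

if isinstance(attention_head_dim, int):
    num_attention_heads = [out_channels // attention_head_dim for out_channels in block_out_channels]
else:
    num_attention_heads = [
        out_channels // attn_dim for out_channels, attn_dim in zip(block_out_channels, attention_head_dim)
    ]

print(num_attention_heads)  # [5, 10, 20, 20] -> each block keeps head_dim == 64
```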

# If `num_attention_heads` is not defined (which is the case for most models)
# it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
# The reason for this behavior is to correct for incorrectly named variables that were introduced
# when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
# Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
# which is why we correct for the naming here.
-num_attention_heads = num_attention_heads or attention_head_dim
Member

Also remove or update the comment above, assuming num_attention_heads is replicated in the Hub configs. Do we know how many models like https://huggingface.co/ali-vilab/i2vgen-xl/blob/6c4e9e70bdcd36eb59d98d2b583adea0813ea8de/unet/config.json#L21 we would need to update?

Will we need to live forever with duplicated property names on the Hub?

@yiyixuxu (Collaborator) Feb 7, 2024

Just this one model, and it was only released a few days ago.
IMO it's an edge case we don't mind breaking - it will only affect people who want to use the local copy, no?


# Check inputs
if len(down_block_types) != len(up_block_types):