NotSoBot
diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md
Lines changed: 2 additions & 36 deletions
diff --git a/examples/dreambooth/train_dreambooth_lora_qwen_image.py b/examples/dreambooth/train_dreambooth_lora_qwen_image.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/diffusers/models/attention_dispatch.py b/src/diffusers/models/attention_dispatch.py
Lines changed: 2 additions & 76 deletions
diff --git a/src/diffusers/models/controlnets/controlnet_qwenimage.py b/src/diffusers/models/controlnets/controlnet_qwenimage.py
Lines changed: 16 additions & 55 deletions
@@ -108,46 +108,12 @@ pipe = QwenImageEditPlusPipeline.from_pretrained(
 image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/grumpy.jpg")
 image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png")
 image = pipe(
-    image=[image_1, image_2],
-    prompt='''put the penguin and the cat at a game show called "Qwen Edit Plus Games"''',
+    image=[image_1, image_2], 
+    prompt='''put the penguin and the cat at a game show called "Qwen Edit Plus Games"''', 
     num_inference_steps=50
 ).images[0]
 ```
 
-## Performance
-
-### torch.compile
-
-Using `torch.compile` on the transformer provides ~2.4x speedup (A100 80GB: 4.70s → 1.93s):
-
-```python
-import torch
-from diffusers import QwenImagePipeline
-
-pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16).to("cuda")
-pipe.transformer = torch.compile(pipe.transformer)
-
-# First call triggers compilation (~7s overhead)
-# Subsequent calls run at ~2.4x faster
-image = pipe("a cat", num_inference_steps=50).images[0]
-```
-
-### Batched Inference with Variable-Length Prompts
-
-When using classifier-free guidance (CFG) with prompts of different lengths, the pipeline properly handles padding through attention masking. This ensures padding tokens do not influence the generated output.
-
-```python
-# CFG with different prompt lengths works correctly
-image = pipe(
-    prompt="A cat",
-    negative_prompt="blurry, low quality, distorted",
-    true_cfg_scale=3.5,
-    num_inference_steps=50,
-).images[0]
-```
-
-For detailed benchmark scripts and results, see [this gist](https://gist.github.com/cdutr/bea337e4680268168550292d7819dc2f).
-
 ## QwenImagePipeline
 
 [[autodoc]] QwenImagePipeline
 
@@ -1513,12 +1513,14 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                     height=model_input.shape[3],
                     width=model_input.shape[4],
                 )
+                print(f"{prompt_embeds_mask.sum(dim=1).tolist()=}")
                 model_pred = transformer(
                     hidden_states=packed_noisy_model_input,
                     encoder_hidden_states=prompt_embeds,
                     encoder_hidden_states_mask=prompt_embeds_mask,
                     timestep=timesteps / 1000,
                     img_shapes=img_shapes,
+                    txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(),
                     return_dict=False,
                 )[0]
                 model_pred = QwenImagePipeline._unpack_latents(
 
@@ -2128,43 +2128,6 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
     return out
 
 
-def _prepare_additive_attn_mask(
-    attn_mask: torch.Tensor, target_dtype: torch.dtype, reshape_4d: bool = True
-) -> torch.Tensor:
-    """
-    Convert a 2D attention mask to an additive mask, optionally reshaping to 4D for SDPA.
-
-    This helper is used by both native SDPA and xformers backends to handle both boolean and additive masks.
-
-    Args:
-        attn_mask: 2D tensor [batch_size, seq_len_k]
-                   - Boolean: True means attend, False means mask out
-                   - Additive: 0.0 means attend, -inf means mask out
-        target_dtype: The dtype to convert the mask to (usually query.dtype)
-        reshape_4d: If True, reshape from [batch_size, seq_len_k] to [batch_size, 1, 1, seq_len_k] for broadcasting
-
-    Returns:
-        Additive mask tensor where 0.0 means attend and -inf means mask out. Shape is [batch_size, seq_len_k] if
-        reshape_4d=False, or [batch_size, 1, 1, seq_len_k] if reshape_4d=True.
-    """
-    # Check if the mask is boolean or already additive
-    if attn_mask.dtype == torch.bool:
-        # Convert boolean to additive: True -> 0.0, False -> -inf
-        attn_mask = torch.where(attn_mask, 0.0, float("-inf"))
-        # Convert to target dtype
-        attn_mask = attn_mask.to(dtype=target_dtype)
-    else:
-        # Already additive mask - just ensure correct dtype
-        attn_mask = attn_mask.to(dtype=target_dtype)
-
-    # Optionally reshape to 4D for broadcasting in attention mechanisms
-    if reshape_4d:
-        batch_size, seq_len_k = attn_mask.shape
-        attn_mask = attn_mask.view(batch_size, 1, 1, seq_len_k)
-
-    return attn_mask
-
-
 @_AttentionBackendRegistry.register(
     AttentionBackendName.NATIVE,
     constraints=[_check_device, _check_shape],
@@ -2184,19 +2147,6 @@ def _native_attention(
 ) -> torch.Tensor:
     if return_lse:
         raise ValueError("Native attention backend does not support setting `return_lse=True`.")
-
-    # Reshape 2D mask to 4D for SDPA
-    # SDPA accepts both boolean masks (torch.bool) and additive masks (float)
-    if (
-        attn_mask is not None
-        and attn_mask.ndim == 2
-        and attn_mask.shape[0] == query.shape[0]
-        and attn_mask.shape[1] == key.shape[1]
-    ):
-        # Just reshape [batch_size, seq_len_k] -> [batch_size, 1, 1, seq_len_k]
-        # SDPA handles both boolean and additive masks correctly
-        attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
-
     if _parallel_config is None:
         query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
         out = torch.nn.functional.scaled_dot_product_attention(
@@ -2763,34 +2713,10 @@ def _xformers_attention(
         attn_mask = xops.LowerTriangularMask()
     elif attn_mask is not None:
         if attn_mask.ndim == 2:
-            # Convert 2D mask to 4D for xformers
-            # Mask can be boolean (True=attend, False=mask) or additive (0.0=attend, -inf=mask)
-            # xformers requires 4D additive masks [batch, heads, seq_q, seq_k]
-            # Need memory alignment - create larger tensor and slice for alignment
-            original_seq_len = attn_mask.size(1)
-            aligned_seq_len = ((original_seq_len + 7) // 8) * 8  # Round up to multiple of 8
-
-            # Create aligned 4D tensor and slice to ensure proper memory layout
-            aligned_mask = torch.zeros(
-                (batch_size, num_heads_q, seq_len_q, aligned_seq_len),
-                dtype=query.dtype,
-                device=query.device,
-            )
-            # Convert to 4D additive mask (handles both boolean and additive inputs)
-            mask_additive = _prepare_additive_attn_mask(
-                attn_mask, target_dtype=query.dtype
-            )  # [batch, 1, 1, seq_len_k]
-            # Broadcast to [batch, heads, seq_q, seq_len_k]
-            aligned_mask[:, :, :, :original_seq_len] = mask_additive
-            # Mask out the padding (already -inf from zeros -> where with default)
-            aligned_mask[:, :, :, original_seq_len:] = float("-inf")
-
-            # Slice to actual size with proper alignment
-            attn_mask = aligned_mask[:, :, :, :seq_len_kv]
+            attn_mask = attn_mask.view(attn_mask.size(0), 1, attn_mask.size(1), 1)
         elif attn_mask.ndim != 4:
             raise ValueError("Only 2D and 4D attention masks are supported for xformers attention.")
-        elif attn_mask.ndim == 4:
-            attn_mask = attn_mask.expand(batch_size, num_heads_q, seq_len_q, seq_len_kv).type_as(query)
+        attn_mask = attn_mask.expand(batch_size, num_heads_q, seq_len_q, seq_len_kv).type_as(query)
 
     if enable_gqa:
         if num_heads_q % num_heads_kv != 0:
 
@@ -20,7 +20,7 @@
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
-from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
 from ..attention import AttentionMixin
 from ..cache_utils import CacheMixin
 from ..controlnets.controlnet import zero_module
@@ -31,7 +31,6 @@
     QwenImageTransformerBlock,
     QwenTimestepProjEmbeddings,
     RMSNorm,
-    compute_text_seq_len_from_mask,
 )
 
 
@@ -137,7 +136,7 @@ def forward(
         return_dict: bool = True,
     ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
         """
-        The [`QwenImageControlNetModel`] forward method.
+        The [`FluxTransformer2DModel`] forward method.
 
         Args:
             hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
@@ -148,39 +147,24 @@ def forward(
                 The scale factor for ControlNet outputs.
             encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                 Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
-                Mask for the encoder hidden states. Expected to have 1.0 for valid tokens and 0.0 for padding tokens.
-                Used in the attention processor to prevent attending to padding tokens. The mask can have any pattern
-                (not just contiguous valid tokens followed by padding) since it's applied element-wise in attention.
+            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
+                from the embeddings of input conditions.
             timestep ( `torch.LongTensor`):
                 Used to indicate denoising step.
-            img_shapes (`List[Tuple[int, int, int]]`, *optional*):
-                Image shapes for RoPE computation.
-            txt_seq_lens (`List[int]`, *optional*):
-                **Deprecated**. Not needed anymore, we use `encoder_hidden_states` instead to infer text sequence
-                length.
+            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
+                A list of tensors that if specified are added to the residuals of transformer blocks.
             joint_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
 
         Returns:
-            If `return_dict` is True, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a `tuple` where
-            the first element is the controlnet block samples.
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
         """
-        # Handle deprecated txt_seq_lens parameter
-        if txt_seq_lens is not None:
-            deprecate(
-                "txt_seq_lens",
-                "0.39.0",
-                "Passing `txt_seq_lens` to `QwenImageControlNetModel.forward()` is deprecated and will be removed in "
-                "version 0.39.0. The text sequence length is now automatically inferred from `encoder_hidden_states` "
-                "and `encoder_hidden_states_mask`.",
-                standard_warn=False,
-            )
-
         if joint_attention_kwargs is not None:
             joint_attention_kwargs = joint_attention_kwargs.copy()
             lora_scale = joint_attention_kwargs.pop("scale", 1.0)
@@ -202,47 +186,32 @@ def forward(
 
         temb = self.time_text_embed(timestep, hidden_states)
 
-        # Use the encoder_hidden_states sequence length for RoPE computation and normalize mask
-        text_seq_len, _, encoder_hidden_states_mask = compute_text_seq_len_from_mask(
-            encoder_hidden_states, encoder_hidden_states_mask
-        )
-
-        image_rotary_emb = self.pos_embed(img_shapes, max_txt_seq_len=text_seq_len, device=hidden_states.device)
+        image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=hidden_states.device)
 
         timestep = timestep.to(hidden_states.dtype)
         encoder_hidden_states = self.txt_norm(encoder_hidden_states)
         encoder_hidden_states = self.txt_in(encoder_hidden_states)
 
-        # Construct joint attention mask once to avoid reconstructing in every block
-        block_attention_kwargs = joint_attention_kwargs.copy() if joint_attention_kwargs is not None else {}
-        if encoder_hidden_states_mask is not None:
-            # Build joint mask: [text_mask, all_ones_for_image]
-            batch_size, image_seq_len = hidden_states.shape[:2]
-            image_mask = torch.ones((batch_size, image_seq_len), dtype=torch.bool, device=hidden_states.device)
-            joint_attention_mask = torch.cat([encoder_hidden_states_mask, image_mask], dim=1)
-            block_attention_kwargs["attention_mask"] = joint_attention_mask
-
         block_samples = ()
-        for block in self.transformer_blocks:
+        for index_block, block in enumerate(self.transformer_blocks):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
                     block,
                     hidden_states,
                     encoder_hidden_states,
-                    None,  # Don't pass encoder_hidden_states_mask (using attention_mask instead)
+                    encoder_hidden_states_mask,
                     temb,
                     image_rotary_emb,
-                    block_attention_kwargs,
                 )
 
             else:
                 encoder_hidden_states, hidden_states = block(
                     hidden_states=hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
-                    encoder_hidden_states_mask=None,  # Don't pass (using attention_mask instead)
+                    encoder_hidden_states_mask=encoder_hidden_states_mask,
                     temb=temb,
                     image_rotary_emb=image_rotary_emb,
-                    joint_attention_kwargs=block_attention_kwargs,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
             block_samples = block_samples + (hidden_states,)
 
@@ -298,15 +267,6 @@ def forward(
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
     ) -> Union[QwenImageControlNetOutput, Tuple]:
-        if txt_seq_lens is not None:
-            deprecate(
-                "txt_seq_lens",
-                "0.39.0",
-                "Passing `txt_seq_lens` to `QwenImageMultiControlNetModel.forward()` is deprecated and will be "
-                "removed in version 0.39.0. The text sequence length is now automatically inferred from "
-                "`encoder_hidden_states` and `encoder_hidden_states_mask`.",
-                standard_warn=False,
-            )
         # ControlNet-Union with multiple conditions
         # only load one ControlNet for saving memories
         if len(self.nets) == 1:
@@ -321,6 +281,7 @@ def forward(
                     encoder_hidden_states_mask=encoder_hidden_states_mask,
                     timestep=timestep,
                     img_shapes=img_shapes,
+                    txt_seq_lens=txt_seq_lens,
                     joint_attention_kwargs=joint_attention_kwargs,
                     return_dict=return_dict,
                 )