 diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
-index 24abf54d..3fa7df5f 100644
+index 24abf54d6..3fa7df5f3 100644
 --- a/src/diffusers/models/transformer_2d.py
 +++ b/src/diffusers/models/transformer_2d.py
 @@ -385,7 +385,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
@@ -21,7 +21,7 @@ index 24abf54d..3fa7df5f 100644
             output = hidden_states + residual
         elif self.is_input_vectorized:
 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
-index f248b243..27d4802d 100644
+index f248b243f..7c83d2cf5 100644
 --- a/src/diffusers/models/unet_2d_condition.py
 +++ b/src/diffusers/models/unet_2d_condition.py
 @@ -799,8 +799,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
@@ -34,8 +34,17 @@ index f248b243..27d4802d 100644
         attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+@@ -808,7 +808,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+-        return_dict: bool = True,
++        return_dict: bool = False,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        The [`UNet2DConditionModel`] forward method.
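The hunk above adds a change to the vendored patch that flips the default of return_dict in UNet2DConditionModel.forward from True to False, so the patched UNet returns a plain tuple unless callers opt back in. A minimal sketch of the caller-side difference, assuming a stock diffusers UNet2DConditionModel; the tiny config below is hypothetical, chosen only so the snippet runs quickly:

import torch
from diffusers import UNet2DConditionModel

# Hypothetical tiny config; real pipelines load pretrained weights instead.
unet = UNet2DConditionModel(
    sample_size=8,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=32,
    layers_per_block=1,
)
sample = torch.randn(1, 4, 8, 8)                # (batch, in_channels, h, w)
encoder_hidden_states = torch.randn(1, 77, 32)  # (batch, seq_len, cross_attention_dim)

# With return_dict=False (the default after this patch), forward returns a
# tuple, so callers index the predicted sample with [0]:
noise_pred = unet(sample, 10, encoder_hidden_states, return_dict=False)[0]

# With return_dict=True (the upstream default), it returns a
# UNet2DConditionOutput dataclass, accessed via .sample:
noise_pred = unet(sample, 10, encoder_hidden_states, return_dict=True).sample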
 diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
-index ff5eea2d..10ea4af1 100644
+index ff5eea2d5..8a9461c87 100644
 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
 @@ -701,17 +701,33 @@ class LatentConsistencyModelPipeline(
@@ -58,16 +67,16 @@ index ff5eea2d..10ea4af1 100644
 +                    model_pred = self.traced_unet(
 +                        latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 +                        t,
-+                        encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+                        timestep_cond=w_embedding.to(dtype=self.precision)
-+                    )['sample']
++                        prompt_embeds.to(dtype=self.precision),
++                        w_embedding.to(dtype=self.precision)
++                    )[0]
 +                elif hasattr(self, 'precision'):
 +                    model_pred = self.unet(
 +                        latents.to(memory_format=torch.channels_last).to(dtype=self.precision),
 +                        t,
-+                        encoder_hidden_states=prompt_embeds.to(dtype=self.precision),
-+                        timestep_cond=w_embedding.to(dtype=self.precision)
-+                    )['sample']
++                        prompt_embeds.to(dtype=self.precision),
++                        w_embedding.to(dtype=self.precision)
++                    )[0]
 +                else:
 +                    model_pred = self.unet(
 +                        latents,
@@ -91,7 +100,7 @@ index ff5eea2d..10ea4af1 100644
         if not output_type == "latent":
             image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
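An aside on why the patch rewrites the traced_unet call: modules compiled with torch.jit.trace replay a positional trace and return plain tuples, so keyword arguments such as encoder_hidden_states= and dict-style ['sample'] indexing no longer apply. A toy stand-in (not the diffusers UNet) illustrating the pattern:

import torch

class TinyUNet(torch.nn.Module):
    """Toy stand-in for the UNet: same four-positional-argument call shape."""
    def forward(self, sample, timestep, encoder_hidden_states, timestep_cond):
        # Arbitrary computation; a real UNet predicts noise here.
        return (sample * timestep + encoder_hidden_states + timestep_cond,)

example_inputs = (
    torch.randn(1, 4),   # sample
    torch.tensor(1.0),   # timestep
    torch.randn(1, 4),   # encoder_hidden_states
    torch.randn(1, 4),   # timestep_cond
)
traced = torch.jit.trace(TinyUNet(), example_inputs)

# The traced module is called positionally and returns a tuple,
# hence `)[0]` in the patch rather than `)['sample']`:
model_pred = traced(*example_inputs)[0]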
 diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
-index 9911cbe7..98c7f2ab 100644
+index 9911cbe75..a4e7101e3 100644
 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 @@ -832,19 +832,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
@@ -116,14 +125,14 @@ index 9911cbe7..98c7f2ab 100644
 +                    noise_pred = self.traced_unet(
 +                        latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 +                        t,
-+                        encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+                    )['sample']
++                        prompt_embeds.to(dtype=self.precision)
++                    )[0]
 +                elif hasattr(self, 'precision'):
 +                    noise_pred = self.unet(
 +                        latent_model_input.to(memory_format=torch.channels_last).to(dtype=self.precision),
 +                        t,
-+                        encoder_hidden_states=prompt_embeds.to(dtype=self.precision)
-+                    )['sample']
++                        prompt_embeds.to(dtype=self.precision)
++                    )[0]
 +                else:
 +                    noise_pred = self.unet(
 +                        latent_model_input,
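Both pipeline patches share the same dispatch-and-cast pattern before each UNet call: prefer a traced UNet when present, otherwise cast inputs when a precision attribute is set, otherwise fall back to the stock call. Note that self.traced_unet and self.precision are attributes these patched pipelines expect to be attached elsewhere; they are not stock diffusers fields. A sketch of the input-conditioning step, with the dtype chosen here only for illustration:

import torch

precision = torch.bfloat16            # stand-in for self.precision
latents = torch.randn(2, 4, 64, 64)   # NCHW latents, as in the pipelines

# channels_last reorders NCHW strides to NHWC, which optimized CPU kernels in
# particular can exploit; the dtype cast matches the (traced) UNet's weights.
latent_model_input = latents.to(memory_format=torch.channels_last).to(dtype=precision)

assert latent_model_input.is_contiguous(memory_format=torch.channels_last)
assert latent_model_input.dtype == precision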