Skip to content

Commit 2dd2ded

Browse files
committed
Do not re-encode the exact same image twice
1 parent 75af1bd commit 2dd2ded

File tree

1 file changed

+14
-21
lines changed

1 file changed

+14
-21
lines changed

stable-diffusion.cpp

Lines changed: 14 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -49,8 +49,7 @@ const char* sampling_methods_str[] = {
4949
"iPNDM_v",
5050
"LCM",
5151
"DDIM \"trailing\"",
52-
"TCD"
53-
};
52+
"TCD"};
5453

5554
/*================================================== Helper Functions ================================================*/
5655

@@ -683,7 +682,7 @@ class StableDiffusionGGML {
683682
float curr_multiplier = kv.second;
684683
lora_state_diff[lora_name] -= curr_multiplier;
685684
}
686-
685+
687686
size_t rm = lora_state_diff.size() - lora_state.size();
688687
if (rm != 0) {
689688
LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
@@ -806,7 +805,6 @@ class StableDiffusionGGML {
806805
float skip_layer_start = 0.01,
807806
float skip_layer_end = 0.2,
808807
ggml_tensor* noise_mask = nullptr) {
809-
810808
// TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
811809

812810
float img_cfg_scale = guidance;
@@ -834,7 +832,7 @@ class StableDiffusionGGML {
834832

835833
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
836834
bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale;
837-
has_unconditioned = has_unconditioned || has_img_guidance;
835+
has_unconditioned = has_unconditioned || has_img_guidance;
838836
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
839837

840838
// denoise wrapper
@@ -988,7 +986,7 @@ class StableDiffusionGGML {
988986
int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
989987
float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
990988
} else {
991-
if(has_img_guidance){
989+
if (has_img_guidance) {
992990
latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
993991
} else {
994992
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
@@ -1393,7 +1391,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
13931391
sd_ctx->sd->diffusion_model->get_adm_in_channels());
13941392

13951393
SDCondition uncond;
1396-
if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!=guidance) {
1394+
if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != guidance) {
13971395
bool force_zero_embeddings = false;
13981396
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) {
13991397
force_zero_embeddings = true;
@@ -1739,6 +1737,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17391737

17401738
sd_image_to_tensor(init_image.data, init_img);
17411739

1740+
ggml_tensor* init_latent = NULL;
1741+
if (!sd_ctx->sd->use_tiny_autoencoder) {
1742+
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1743+
init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
1744+
} else {
1745+
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1746+
}
1747+
17421748
ggml_tensor* masked_image;
17431749

17441750
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
@@ -1786,12 +1792,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17861792
}
17871793
} else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) {
17881794
// Not actually masked, we're just highjacking the masked_image variable since it will be used the same way
1789-
if (!sd_ctx->sd->use_tiny_autoencoder) {
1790-
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1791-
masked_image = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
1792-
} else {
1793-
masked_image = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1794-
}
1795+
masked_image = init_latent;
17951796
} else {
17961797
// LOG_WARN("Inpainting with a base model is not great");
17971798
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
@@ -1805,14 +1806,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
18051806
}
18061807
}
18071808

1808-
ggml_tensor* init_latent = NULL;
1809-
if (!sd_ctx->sd->use_tiny_autoencoder) {
1810-
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1811-
init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
1812-
} else {
1813-
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
1814-
}
1815-
18161809
print_ggml_tensor(init_latent, true);
18171810
size_t t1 = ggml_time_ms();
18181811
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

0 commit comments

Comments (0)