@@ -806,6 +806,11 @@ class StableDiffusionGGML {
806
806
float skip_layer_start = 0.01 ,
807
807
float skip_layer_end = 0.2 ,
808
808
ggml_tensor* noise_mask = nullptr ) {
809
+
810
+ // TODO (Pix2Pix): add a dedicated image-guidance scale parameter (for now this reuses the distilled guidance value)
811
+
812
+ float img_cfg_scale = guidance;
813
+
809
814
LOG_DEBUG (" Sample" );
810
815
struct ggml_init_params params;
811
816
size_t data_size = ggml_row_size (init_latent->type , init_latent->ne [0 ]);
@@ -828,12 +833,15 @@ class StableDiffusionGGML {
828
833
struct ggml_tensor * noised_input = ggml_dup_tensor (work_ctx, noise);
829
834
830
835
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL ;
836
+ bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale;
837
+ has_unconditioned = has_unconditioned || has_img_guidance;
831
838
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size () > 0 ;
832
839
833
840
// denoise wrapper
834
- struct ggml_tensor * out_cond = ggml_dup_tensor (work_ctx, x);
835
- struct ggml_tensor * out_uncond = NULL ;
836
- struct ggml_tensor * out_skip = NULL ;
841
+ struct ggml_tensor * out_cond = ggml_dup_tensor (work_ctx, x);
842
+ struct ggml_tensor * out_uncond = NULL ;
843
+ struct ggml_tensor * out_skip = NULL ;
844
+ struct ggml_tensor * out_img_cond = NULL ;
837
845
838
846
if (has_unconditioned) {
839
847
out_uncond = ggml_dup_tensor (work_ctx, x);
@@ -846,6 +854,9 @@ class StableDiffusionGGML {
846
854
LOG_WARN (" SLG is incompatible with %s models" , model_version_to_str[version]);
847
855
}
848
856
}
857
+ if (has_img_guidance) {
858
+ out_img_cond = ggml_dup_tensor (work_ctx, x);
859
+ }
849
860
struct ggml_tensor * denoised = ggml_dup_tensor (work_ctx, x);
850
861
851
862
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -927,6 +938,22 @@ class StableDiffusionGGML {
927
938
negative_data = (float *)out_uncond->data ;
928
939
}
929
940
941
+ float * img_cond_data = NULL ;
942
+ if (has_img_guidance) {
943
+ diffusion_model->compute (n_threads,
944
+ noised_input,
945
+ timesteps,
946
+ uncond.c_crossattn ,
947
+ cond.c_concat ,
948
+ uncond.c_vector ,
949
+ guidance_tensor,
950
+ -1 ,
951
+ controls,
952
+ control_strength,
953
+ &out_img_cond);
954
+ img_cond_data = (float *)out_img_cond->data ;
955
+ }
956
+
930
957
int step_count = sigmas.size ();
931
958
bool is_skiplayer_step = has_skiplayer && step > (int )(skip_layer_start * step_count) && step < (int )(skip_layer_end * step_count);
932
959
float * skip_layer_data = NULL ;
@@ -961,7 +988,11 @@ class StableDiffusionGGML {
961
988
int64_t i3 = i / out_cond->ne [0 ] * out_cond->ne [1 ] * out_cond->ne [2 ];
962
989
float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1 .0f / ne3);
963
990
} else {
964
- latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
991
+ if (has_img_guidance){
992
+ latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
993
+ } else {
994
+ latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
995
+ }
965
996
}
966
997
}
967
998
if (is_skiplayer_step) {
@@ -1362,7 +1393,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1362
1393
sd_ctx->sd ->diffusion_model ->get_adm_in_channels ());
1363
1394
1364
1395
SDCondition uncond;
1365
- if (cfg_scale != 1.0 ) {
1396
+ if (cfg_scale != 1.0 || sd_ctx-> sd -> version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!=guidance ) {
1366
1397
bool force_zero_embeddings = false ;
1367
1398
if (sd_version_is_sdxl (sd_ctx->sd ->version ) && negative_prompt.size () == 0 ) {
1368
1399
force_zero_embeddings = true ;
0 commit comments