@@ -314,7 +314,7 @@ class StableDiffusionGGML {
314
314
// TODO: shift_factor
315
315
}
316
316
317
- if ( version == VERSION_FLEX_2) {
317
+ if ( sd_version_is_control ( version)) {
318
318
// Might need vae encode for control cond
319
319
vae_decode_only = false ;
320
320
}
@@ -840,7 +840,7 @@ class StableDiffusionGGML {
840
840
int start_merge_step,
841
841
SDCondition id_cond,
842
842
std::vector<ggml_tensor*> ref_latents = {},
843
- ggml_tensor* denoise_mask = nullptr ) {
843
+ ggml_tensor* denoise_mask = nullptr ) {
844
844
std::vector<int > skip_layers (guidance.slg .layers , guidance.slg .layers + guidance.slg .layer_count );
845
845
846
846
// TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1512,6 +1512,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1512
1512
int W = width / 8 ;
1513
1513
int H = height / 8 ;
1514
1514
LOG_INFO (" sampling using %s method" , sampling_methods_str[sample_method]);
1515
+
1516
+ struct ggml_tensor * control_latent = NULL ;
1517
+ if (sd_version_is_control (sd_ctx->sd ->version ) && image_hint != NULL ) {
1518
+ if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1519
+ struct ggml_tensor * control_moments = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1520
+ control_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1521
+ } else {
1522
+ control_latent = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1523
+ }
1524
+ }
1525
+
1515
1526
if (sd_version_is_inpaint (sd_ctx->sd ->version )) {
1516
1527
int64_t mask_channels = 1 ;
1517
1528
if (sd_ctx->sd ->version == VERSION_FLUX_FILL) {
@@ -1544,50 +1555,44 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1544
1555
}
1545
1556
}
1546
1557
}
1547
- if (sd_ctx->sd ->version == VERSION_FLEX_2 && image_hint != NULL && sd_ctx->sd ->control_net == NULL ) {
1558
+
1559
+ if (sd_ctx->sd ->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd ->control_net == NULL ) {
1548
1560
bool no_inpaint = concat_latent == NULL ;
1549
1561
if (no_inpaint) {
1550
1562
concat_latent = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, init_latent->ne [0 ], init_latent->ne [1 ], mask_channels + init_latent->ne [2 ], 1 );
1551
1563
}
1552
1564
// fill in the control image here
1553
- struct ggml_tensor * control_latents = NULL ;
1554
- if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1555
- struct ggml_tensor * control_moments = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1556
- control_latents = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1557
- } else {
1558
- control_latents = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1559
- }
1560
- for (int64_t x = 0 ; x < concat_latent->ne [0 ]; x++) {
1561
- for (int64_t y = 0 ; y < concat_latent->ne [1 ]; y++) {
1565
+ for (int64_t x = 0 ; x < control_latent->ne [0 ]; x++) {
1566
+ for (int64_t y = 0 ; y < control_latent->ne [1 ]; y++) {
1562
1567
if (no_inpaint) {
1563
- for (int64_t c = 0 ; c < concat_latent->ne [2 ] - control_latents ->ne [2 ]; c++) {
1568
+ for (int64_t c = 0 ; c < concat_latent->ne [2 ] - control_latent ->ne [2 ]; c++) {
1564
1569
// 0x16,1x1,0x16
1565
1570
ggml_tensor_set_f32 (concat_latent, c == init_latent->ne [2 ], x, y, c);
1566
1571
}
1567
1572
}
1568
- for (int64_t c = 0 ; c < control_latents ->ne [2 ]; c++) {
1569
- float v = ggml_tensor_get_f32 (control_latents , x, y, c);
1570
- ggml_tensor_set_f32 (concat_latent, v, x, y, concat_latent->ne [2 ] - control_latents ->ne [2 ] + c);
1573
+ for (int64_t c = 0 ; c < control_latent ->ne [2 ]; c++) {
1574
+ float v = ggml_tensor_get_f32 (control_latent , x, y, c);
1575
+ ggml_tensor_set_f32 (concat_latent, v, x, y, concat_latent->ne [2 ] - control_latent ->ne [2 ] + c);
1571
1576
}
1572
1577
}
1573
1578
}
1574
- // Disable controlnet
1575
- image_hint = NULL ;
1576
1579
} else if (concat_latent == NULL ) {
1577
1580
concat_latent = empty_latent;
1578
1581
}
1579
1582
cond.c_concat = concat_latent;
1580
1583
uncond.c_concat = empty_latent;
1581
1584
denoise_mask = NULL ;
1582
- } else if (sd_version_is_edit (sd_ctx->sd ->version )) {
1585
+ } else if (sd_version_is_edit (sd_ctx->sd ->version ) || sd_version_is_control (sd_ctx-> sd -> version ) ) {
1583
1586
auto empty_latent = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, init_latent->ne [0 ], init_latent->ne [1 ], init_latent->ne [2 ], init_latent->ne [3 ]);
1584
1587
ggml_set_f32 (empty_latent, 0 );
1585
1588
uncond.c_concat = empty_latent;
1589
+ if (sd_version_is_control (sd_ctx->sd ->version ) && control_latent != NULL && sd_ctx->sd ->control_net == NULL ) {
1590
+ concat_latent = control_latent;
1591
+ }
1586
1592
if (concat_latent == NULL ) {
1587
1593
concat_latent = empty_latent;
1588
1594
}
1589
- cond.c_concat = concat_latent;
1590
-
1595
+ cond.c_concat = concat_latent;
1591
1596
}
1592
1597
for (int b = 0 ; b < batch_count; b++) {
1593
1598
int64_t sampling_start = ggml_time_ms ();
@@ -1870,7 +1875,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
1870
1875
ggml_tensor* masked_latent = NULL ;
1871
1876
if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1872
1877
ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1873
- masked_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1878
+ masked_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1874
1879
} else {
1875
1880
masked_latent = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1876
1881
}
@@ -1941,8 +1946,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
1941
1946
} else {
1942
1947
concat_latent = init_latent;
1943
1948
}
1944
- }
1945
-
1949
+ }
1950
+
1946
1951
{
1947
1952
// LOG_WARN("Inpainting with a base model is not great");
1948
1953
denoise_mask = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, width / 8 , height / 8 , 1 , 1 );
0 commit comments