Instruct-p2p support

stduhpf · stduhpf · commit 63a6df325b9f · 2025-05-15T20:23:10.000+02:00
diff --git a/model.cpp b/model.cpp
@@ -1539,6 +1539,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
+    bool is_ip2p =  input_block_weight.ne[2] == 8;
     if (is_xl) {
         if (is_inpaint) {
             return VERSION_SDXL_INPAINT;
@@ -1558,6 +1559,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (is_inpaint) {
             return VERSION_SD1_INPAINT;
         }
+        if(is_ip2p) {
+            return VERSION_INSTRUCT_PIX2PIX;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         if (is_inpaint) {
diff --git a/model.h b/model.h
@@ -21,6 +21,7 @@
 enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
+    VERSION_INSTRUCT_PIX2PIX,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
@@ -47,7 +48,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_INSTRUCT_PIX2PIX) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -27,6 +27,7 @@
 const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 1.x Inpaint",
+    "Instruct-Pix2Pix",
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
@@ -1430,9 +1431,16 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         }
         cond.c_concat   = masked_image;
         uncond.c_concat = masked_image;
+        // noise_mask = masked_image;
+    } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) {
+        cond.c_concat  = masked_image;
+        auto empty_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]);
+        ggml_set_f32(empty_img, 0);
+        uncond.c_concat = empty_img;
     } else {
         noise_mask = masked_image;
     }
+
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed       = seed + b;
@@ -1745,6 +1753,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                 }
             }
         }
+    } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) {
+        // Not actually masked, we're just hijacking the masked_image variable since it will be used the same way
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+            masked_image         = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            masked_image = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        }
     } else {
         // LOG_WARN("Inpainting with a base model is not great");
         masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
diff --git a/unet.hpp b/unet.hpp
@@ -207,6 +207,8 @@ class UnetModelBlock : public GGMLBlock {
         }
         if (sd_version_is_inpaint(version)) {
             in_channels = 9;
+        } else if (version == VERSION_INSTRUCT_PIX2PIX) {
+            in_channels = 8;
         }
 
         // dims is always 2

Original file line number	Diff line number	Diff line change
`@@ -1539,6 +1539,7 @@ SDVersion ModelLoader::get_sd_version() {`
`1539`	`1539`	`}`
`1540`	`1540`	`}`
`1541`	`1541`	`bool is_inpaint = input_block_weight.ne[2] == 9;`
	`1542`	`+ bool is_ip2p = input_block_weight.ne[2] == 8;`
`1542`	`1543`	`if (is_xl) {`
`1543`	`1544`	`if (is_inpaint) {`
`1544`	`1545`	`return VERSION_SDXL_INPAINT;`
`@@ -1558,6 +1559,9 @@ SDVersion ModelLoader::get_sd_version() {`
`1558`	`1559`	`if (is_inpaint) {`
`1559`	`1560`	`return VERSION_SD1_INPAINT;`
`1560`	`1561`	`}`
	`1562`	`+ if(is_ip2p) {`
	`1563`	`+ return VERSION_INSTRUCT_PIX2PIX;`
	`1564`	`+ }`
`1561`	`1565`	`return VERSION_SD1;`
`1562`	`1566`	`} else if (token_embedding_weight.ne[0] == 1024) {`
`1563`	`1567`	`if (is_inpaint) {`
Original file line number	Diff line number	Diff line change
`@@ -207,6 +207,8 @@ class UnetModelBlock : public GGMLBlock {`
`207`	`207`	`}`
`208`	`208`	`if (sd_version_is_inpaint(version)) {`
`209`	`209`	`in_channels = 9;`
	`210`	`+ } else if (version == VERSION_INSTRUCT_PIX2PIX) {`
	`211`	`+ in_channels = 8;`
`210`	`212`	`}`
`211`	`213`
`212`	`214`	`// dims is always 2`