Skip to content

Commit d73b419

Browse files
authored
feat: SDXS-09 support and update doc (#1356)
1 parent 5c243db commit d73b419

7 files changed

Lines changed: 51 additions & 48 deletions

File tree

docs/distilled_sd.md

Lines changed: 16 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
8787
```bash
8888
python convert_diffusers_to_original_stable_diffusion.py \
8989
--model_path ./segmindtiny-sd \
90-
--checkpoint_path ./segmind_tiny-sd.ckpt --half
90+
--checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
9191
```
9292

93-
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
93+
The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
9494

9595

96-
##### Another available .ckpt file:
97-
98-
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
99-
100-
To use this file, you must first adjust its non-contiguous tensors:
101-
102-
```python
103-
import torch
104-
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
105-
for key, value in ckpt['state_dict'].items():
106-
if isinstance(value, torch.Tensor):
107-
ckpt['state_dict'][key] = value.contiguous()
108-
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
109-
```
110-
111-
112-
### SDXS-512
96+
### SDXS-512-DreamShaper
11397

11498
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
99+
##### Some ready-to-run SDXS-512 model files are available online, such as:
115100

116-
##### 1. Download the diffusers model from Hugging Face using Python:
117-
118-
```python
119-
from diffusers import StableDiffusionPipeline
120-
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
121-
pipe.save_pretrained(save_directory="sdxs")
122-
```
123-
##### 2. Create a safetensors file
124-
125-
```bash
126-
python convert_diffusers_to_original_stable_diffusion.py \
127-
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
128-
```
129-
130-
##### 3. Run the model as follows:
101+
* https://huggingface.co/akleine/sdxs-512
102+
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
131103

104+
##### Run the model as follows:
132105
```bash
133106
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
134107
--cfg-scale 1 --steps 1
135108
```
109+
Both options, ``` --cfg-scale 1 ``` and ``` --steps 1 ```, are mandatory here.
110+
111+
### SDXS-512-0.9
112+
113+
Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself.
114+
##### Download a ready-to-run file from here:
115+
116+
* https://huggingface.co/akleine/sdxs-09
136117

137-
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
118+
To use this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again mandatory.

src/common_block.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
277277
int64_t context_dim;
278278
int64_t n_head;
279279
int64_t d_head;
280+
bool xtra_dim = false;
280281

281282
public:
282283
CrossAttention(int64_t query_dim,
@@ -288,7 +289,11 @@ class CrossAttention : public GGMLBlock {
288289
query_dim(query_dim),
289290
context_dim(context_dim) {
290291
int64_t inner_dim = d_head * n_head;
291-
292+
if (context_dim == 320 && d_head == 320) {
293+
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
294+
xtra_dim = true;
295+
context_dim = 1024;
296+
}
292297
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
293298
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
294299
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +318,16 @@ class CrossAttention : public GGMLBlock {
313318
int64_t n_context = context->ne[1];
314319
int64_t inner_dim = d_head * n_head;
315320

316-
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
321+
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
322+
if (xtra_dim) {
323+
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
324+
context->ne[0] = 1024; // patch dim
325+
}
317326
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
318327
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
319-
328+
if (xtra_dim) {
329+
context->ne[0] = 320; // reset dim to orig
330+
}
320331
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
321332

322333
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]

src/model.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,6 +1019,7 @@ SDVersion ModelLoader::get_sd_version() {
10191019
bool has_middle_block_1 = false;
10201020
bool has_output_block_311 = false;
10211021
bool has_output_block_71 = false;
1022+
bool has_attn_1024 = false;
10221023

10231024
for (auto& [name, tensor_storage] : tensor_storage_map) {
10241025
if (!(is_xl)) {
@@ -1091,6 +1092,10 @@ SDVersion ModelLoader::get_sd_version() {
10911092
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
10921093
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
10931094
has_output_block_71 = true;
1095+
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
1096+
if (tensor_storage.ne[0] == 1024)
1097+
has_attn_1024 = true;
1098+
}
10941099
}
10951100
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
10961101
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
@@ -1164,7 +1169,7 @@ SDVersion ModelLoader::get_sd_version() {
11641169
}
11651170
if (!has_middle_block_1) {
11661171
if (!has_output_block_71) {
1167-
return VERSION_SDXS;
1172+
return VERSION_SDXS_512_DS;
11681173
}
11691174
return VERSION_SD1_TINY_UNET;
11701175
}
@@ -1174,7 +1179,7 @@ SDVersion ModelLoader::get_sd_version() {
11741179
return VERSION_SD2_INPAINT;
11751180
}
11761181
if (!has_middle_block_1) {
1177-
return VERSION_SD2_TINY_UNET;
1182+
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
11781183
}
11791184
return VERSION_SD2;
11801185
}

src/model.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ enum SDVersion {
2828
VERSION_SD2,
2929
VERSION_SD2_INPAINT,
3030
VERSION_SD2_TINY_UNET,
31-
VERSION_SDXS,
31+
VERSION_SDXS_512_DS,
32+
VERSION_SDXS_09,
3233
VERSION_SDXL,
3334
VERSION_SDXL_INPAINT,
3435
VERSION_SDXL_PIX2PIX,
@@ -55,14 +56,14 @@ enum SDVersion {
5556
};
5657

5758
static inline bool sd_version_is_sd1(SDVersion version) {
58-
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
59+
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
5960
return true;
6061
}
6162
return false;
6263
}
6364

6465
static inline bool sd_version_is_sd2(SDVersion version) {
65-
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
66+
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
6667
return true;
6768
}
6869
return false;

src/name_conversion.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
11201120
for (const auto& prefix : first_stage_model_prefix_vec) {
11211121
if (starts_with(name, prefix)) {
11221122
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
1123-
if (version == VERSION_SDXS) {
1123+
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
11241124
name = "tae." + name;
11251125
} else {
11261126
name = prefix + name;

src/stable-diffusion.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ const char* model_version_to_str[] = {
3030
"SD 2.x",
3131
"SD 2.x Inpaint",
3232
"SD 2.x Tiny UNet",
33-
"SDXS",
33+
"SDXS (512-DS)",
34+
"SDXS (09)",
3435
"SDXL",
3536
"SDXL Inpaint",
3637
"SDXL Instruct-Pix2Pix",
@@ -414,7 +415,7 @@ class StableDiffusionGGML {
414415
}
415416

416417
bool tae_preview_only = sd_ctx_params->tae_preview_only;
417-
if (version == VERSION_SDXS) {
418+
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
418419
tae_preview_only = false;
419420
use_tae = true;
420421
}

src/unet.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,11 @@ class UnetModelBlock : public GGMLBlock {
217217
} else if (sd_version_is_unet_edit(version)) {
218218
in_channels = 8;
219219
}
220-
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
220+
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
221221
num_res_blocks = 1;
222222
channel_mult = {1, 2, 4};
223223
tiny_unet = true;
224-
if (version == VERSION_SDXS) {
224+
if (version == VERSION_SDXS_512_DS) {
225225
attention_resolutions = {4, 2}; // here just like SDXL
226226
}
227227
}
@@ -264,6 +264,10 @@ class UnetModelBlock : public GGMLBlock {
264264
if (version == VERSION_SVD) {
265265
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
266266
} else {
267+
if (version == VERSION_SDXS_09 && n_head == 5) {
268+
n_head = 1; // to carry a special case of sdxs_09 into CrossAttentionLayer,
269+
d_head = 320; // works as long the product remains equal (5*64 == 1*320)
270+
}
267271
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
268272
}
269273
};

0 commit comments

Comments
 (0)