huggingface · jimexist · Jan 22, 2025
diff --git a/router/src/config.rs b/router/src/config.rs
@@ -88,7 +88,12 @@ impl LlavaNext {
     pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
         let image_size = self.vision_config.image_size;
         let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
+            warn!(
+                "Image size {} is not divisible by patch size {}, will round down",
+                image_size, patch_size
+            );
+        }
         let npatches = image_size / patch_size;
         // Dimensions are intentionally swapped to be bug-compatible with
         // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@@ -271,4 +276,26 @@ mod test {
         let slots = config.get_number_of_features(1067, 1600);
         assert_eq!(slots, 2144);
     }
+
+    #[test]
+    fn test_uneven_division() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 337, // 337 % 14 == 1: deliberately not a multiple of patch_size
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+
+        // 337 / 14 truncates to 24 patches per side; the call must warn instead of panicking.
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928); // NOTE(review): pinpoints are multiples of 336; confirm 2928
+    }
 }
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
@@ -170,7 +170,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size
 
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
+        logger.warning(
+            f"Image size {image_size} is not divisible by patch size {patch_size}"
+        )
 
     npatches = image_size // patch_size
 
@@ -520,9 +523,9 @@ def forward(
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )
 
         with self._forward_context(
             block_tables=cuda_graph["block_tables"],