101 changes: 101 additions & 0 deletions supervision/detection/core.py
@@ -44,6 +44,7 @@
from_florence_2,
from_google_gemini_2_0,
from_google_gemini_2_5,
from_kosmos,
from_moondream,
from_paligemma,
from_qwen_2_5_vl,
@@ -832,6 +833,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detections:
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |
| Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` |

Args:
lmm (Union[LMM, str]): The type of LMM (Large Multimodal Model) to use.
@@ -1160,6 +1162,52 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detections:
detections.data
# {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='<U24')}
```

!!! example "Kosmos-2"

??? tip "Prompt engineering"
To get the best results from Kosmos-2, prefix prompts with the `<grounding>`
token so the model returns grounded (localized) entities alongside its text output.

**To detect specific objects, use a targeted prompt:**

```
<grounding>Detect the cats
```

**To detect all salient objects through grounded captioning, use:**

```
<grounding>Describe this image in detail
```

```python
import supervision as sv
from PIL import Image

image = Image.open(<SOURCE_IMAGE_PATH>)  # the outputs below assume a 1170x744 image

kosmos_result = (
'An image of a small statue of a cat, with a gramophone and a man walking past in the background.',
[
('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]),
('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]),
('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)])
]
)
detections = sv.Detections.from_vlm(
sv.VLM.KOSMOS_2,
kosmos_result,
resolution_wh=image.size,
)
detections.xyxy
# array([[310.78125, 11.625 , 822.65625, 732.375],
# [274.21875, 11.625 , 822.65625, 383.625],
# [ 18.28125, 290.625 , 201.09375, 732.375]])

detections.class_id
# array([0, 1, 2])

detections.data
# {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])}
```
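
A minimal sketch of how `kosmos_result` can be produced with Hugging Face
`transformers`, assuming the `microsoft/kosmos-2-patch14-224` checkpoint and
its documented processor interface; the image path is a placeholder:

```python
from PIL import Image
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

image = Image.open("cat_statue.jpg")  # placeholder path
prompt = "<grounding>Describe this image in detail"

inputs = processor(text=prompt, images=image, return_tensors="pt")
generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    max_new_tokens=128,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# post_process_generation returns (caption, entities) -- the exact tuple
# structure that sv.Detections.from_lmm consumes above.
kosmos_result = processor.post_process_generation(generated_text)
```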
""" # noqa: E501

# filler logic mapping old from_lmm to new from_vlm
@@ -1209,6 +1257,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detections:
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
| DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` |
| Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` |

Args:
vlm (Union[VLM, str]): The type of VLM (Vision Language Model) to use.
@@ -1538,6 +1587,52 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detections:
# {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='<U24')}
```

!!! example "Kosmos-2"

??? tip "Prompt engineering"
To get the best results from Kosmos-2, prefix prompts with the `<grounding>`
token so the model returns grounded (localized) entities alongside its text output.

**To detect specific objects, use a targeted prompt:**

```
<grounding>Detect the cats
```

**To detect all salient objects through grounded captioning, use:**

```
<grounding>Describe this image in detail
```

```python
import supervision as sv
from PIL import Image

image = Image.open(<SOURCE_IMAGE_PATH>)  # the outputs below assume a 1170x744 image

kosmos_result = (
'An image of a small statue of a cat, with a gramophone and a man walking past in the background.',
[
('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]),
('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]),
('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)])
]
)
Comment on lines +1612 to +1619

Collaborator: do we need to take all this as input? most of that stuff is unnecessary for us

Contributor Author: We don't need to, but this is the data structure that KOSMOS-2's post processor returns. I personally think asking the users to enter a different data structure would introduce a layer of friction.

detections = sv.Detections.from_vlm(
sv.VLM.KOSMOS_2,
kosmos_result,
resolution_wh=image.size,
)
detections.xyxy
# array([[310.78125, 11.625 , 822.65625, 732.375],
# [274.21875, 11.625 , 822.65625, 383.625],
# [ 18.28125, 290.625 , 201.09375, 732.375]])

detections.class_id
# array([0, 1, 2])

detections.data
# {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])}
```
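
Because `classes` is an allowed argument for `KOSMOS_2`, detections can be
restricted to an exact-match subset of class names; a short sketch continuing
the example above (the chosen class names are illustrative):

```python
detections = sv.Detections.from_vlm(
    sv.VLM.KOSMOS_2,
    kosmos_result,
    resolution_wh=image.size,
    classes=["a gramophone", "a man"],
)
detections.data
# {'class_name': array(['a gramophone', 'a man'])}
```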
""" # noqa: E501

vlm = validate_vlm_parameters(vlm, result, kwargs)
@@ -1592,6 +1687,12 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detections:
data=data,
)

if vlm == VLM.KOSMOS_2:
xyxy, class_id, class_name = from_kosmos(result, **kwargs)
return cls(
xyxy=xyxy, class_id=class_id, data={CLASS_NAME_DATA_FIELD: class_name}
)

return cls.empty()

@classmethod
71 changes: 68 additions & 3 deletions supervision/detection/vlm.py
@@ -5,7 +5,7 @@
import json
import re
from enum import Enum
-from typing import Any
+from typing import Any, get_origin

import numpy as np
from PIL import Image
@@ -40,6 +40,7 @@ class LMM(Enum):
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
MOONDREAM = "moondream"
KOSMOS_2 = "kosmos_2"

@classmethod
def list(cls):
@@ -81,6 +82,7 @@ class VLM(Enum):
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
MOONDREAM = "moondream"
KOSMOS_2 = "kosmos_2"

@classmethod
def list(cls):
@@ -110,6 +112,9 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.GOOGLE_GEMINI_2_0: str,
VLM.GOOGLE_GEMINI_2_5: str,
VLM.MOONDREAM: dict,
VLM.KOSMOS_2: tuple[
str, list[tuple[str, tuple[int, int], list[tuple[int, int, int, int]]]]
],
}

REQUIRED_ARGUMENTS: dict[VLM, list[str]] = {
@@ -120,6 +125,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
VLM.MOONDREAM: ["resolution_wh"],
VLM.KOSMOS_2: ["resolution_wh"],
}

ALLOWED_ARGUMENTS: dict[VLM, list[str]] = {
@@ -130,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
VLM.MOONDREAM: ["resolution_wh"],
VLM.KOSMOS_2: ["resolution_wh", "classes"],
}

SUPPORTED_TASKS_FLORENCE_2 = [
@@ -169,9 +176,11 @@ def validate_vlm_parameters(vlm: VLM | str, result: Any, kwargs: dict[str, Any])
f"Invalid vlm value: {vlm}. Must be one of {[e.value for e in VLM]}"
)

-    if not isinstance(result, RESULT_TYPES[vlm]):
+    expected_type = RESULT_TYPES[vlm]
+    origin_type = get_origin(expected_type) or expected_type
+    if not isinstance(result, origin_type):
         raise ValueError(
-            f"Invalid VLM result type: {type(result)}. Must be {RESULT_TYPES[vlm]}"
+            f"Invalid VLM result type: {type(result)}. Must be {expected_type}"
         )
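
# Why the get_origin() indirection above: isinstance() rejects parameterized
# generics at runtime, so KOSMOS_2's tuple[...] entry in RESULT_TYPES must be
# reduced to its bare origin type before the check. For example:
#
#   isinstance(("caption", []), tuple[str, list])                      # raises TypeError
#   isinstance(("caption", []), get_origin(tuple[str, list]) or tuple)  # True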

required_args = REQUIRED_ARGUMENTS.get(vlm, [])
@@ -795,3 +804,59 @@ def from_moondream(
return np.empty((0, 4))

return np.array(denormalize_xyxy, dtype=float)


def from_kosmos(
result: tuple[
str, list[tuple[str, tuple[int, int], list[tuple[int, int, int, int]]]]
],
resolution_wh: tuple[int, int],
classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Parse and scale bounding boxes from a Kosmos-2 result.

The result is a tuple `(caption, entities)`. The first element is the
generated caption. The second element is a list of tuples, each containing
the class name, the `(start, end)` character span of that class name within
the caption, and the bounding box coordinates normalized to the range [0, 1].

The result is expected to be in the following format:
```python
result = (
'An image of a small statue of a cat, with a gramophone and a man walking past in the background.',
[
('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]),
('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]),
('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)])
]
)
```

Args:
result: The result from the Kosmos-2 model.
resolution_wh: (output_width, output_height) to which we rescale the boxes.
classes: Optional list of valid class names. If provided, returned boxes/labels
are filtered to only those classes found here.

Returns:
xyxy (np.ndarray): An array of shape `(n, 4)` containing
the bounding boxes coordinates in format `[x1, y1, x2, y2]`
class_id (np.ndarray): An array of shape `(n,)` containing
the class indices for each bounding box
class_name (np.ndarray): An array of shape `(n,)` containing
the class labels for each bounding box
""" # noqa: E501
_, entity_locations = result
xyxy, class_names = [], []
for item in entity_locations:
class_name = item[0]

if classes is not None and class_name not in classes:
continue

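# Kosmos-2 can return multiple boxes per entity; only the first is used.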
bbox = item[2][0]
xyxy.append(denormalize_boxes(np.array(bbox), resolution_wh=resolution_wh))
class_names.append(class_name)
return np.array(xyxy).reshape(-1, 4), np.arange(len(xyxy)), np.array(class_names)
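
A quick worked check of the scaling, reusing the `result` tuple from the
docstring above and assuming `denormalize_boxes` multiplies x-coordinates by
the output width and y-coordinates by the output height:

```python
from supervision.detection.vlm import from_kosmos

# For the cat entity: x1 = 0.265625 * 1170 = 310.78125, y1 = 0.015625 * 744 = 11.625,
# x2 = 0.703125 * 1170 = 822.65625, y2 = 0.984375 * 744 = 732.375
xyxy, class_id, class_name = from_kosmos(result, resolution_wh=(1170, 744))
assert xyxy[0].tolist() == [310.78125, 11.625, 822.65625, 732.375]
```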