diff --git a/supervision/detection/core.py b/supervision/detection/core.py index ffe5ed3fc..a73968366 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -44,6 +44,7 @@ from_florence_2, from_google_gemini_2_0, from_google_gemini_2_5, + from_kosmos, from_moondream, from_paligemma, from_qwen_2_5_vl, @@ -832,6 +833,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | + | Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` | Args: lmm (Union[LMM, str]): The type of LMM (Large Multimodal Model) to use. @@ -1160,6 +1162,52 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio detections.data # {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='Detect the cats + ``` + + **For object detection, use the following user prompt:** + + ``` + Describe this image in detail + ``` + + ```python + import supervision as sv + + kosmos_result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + detections = sv.Detections.from_vlm( + sv.VLM.KOSMOS_2, + kosmos_result, + resolution_wh=image.size, + ) + detections.xyxy + # array([[310.78125, 11.625 , 822.65625, 732.375], + # [274.21875, 11.625 , 822.65625, 383.625], + # [ 18.28125, 290.625 , 201.09375, 732.375]]) + + detections.class_id + # array([0, 1, 2]) + + detections.data + # {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])} + ``` """ # noqa: E501 # 
filler logic mapping old from_lmm to new from_vlm @@ -1209,6 +1257,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | + | Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` | Args: vlm (Union[VLM, str]): The type of VLM (Vision Language Model) to use. @@ -1538,6 +1587,52 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio # {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='Detect the cats + ``` + + **For object detection, use the following user prompt:** + + ``` + Describe this image in detail + ``` + + ```python + + import supervision as sv + + kosmos_result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + detections = sv.Detections.from_vlm( + sv.VLM.KOSMOS_2, + kosmos_result, + resolution_wh=image.size, + ) + detections.xyxy + # array([[310.78125, 11.625 , 822.65625, 732.375], + # [274.21875, 11.625 , 822.65625, 383.625], + # [ 18.28125, 290.625 , 201.09375, 732.375]]) + + detections.class_id + # array([0, 1, 2]) + + detections.data + # {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])} + ``` """ # noqa: E501 vlm = validate_vlm_parameters(vlm, result, kwargs) @@ -1592,6 +1687,12 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio data=data, ) + if vlm == VLM.KOSMOS_2: + xyxy, class_id, class_name = from_kosmos(result, **kwargs) + return cls( + xyxy=xyxy, 
class_id=class_id, data={CLASS_NAME_DATA_FIELD: class_name} + ) + return cls.empty() @classmethod diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 71207554e..528701e39 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -5,7 +5,7 @@ import json import re from enum import Enum -from typing import Any +from typing import Any, get_origin import numpy as np from PIL import Image @@ -40,6 +40,7 @@ class LMM(Enum): GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" MOONDREAM = "moondream" + KOSMOS_2 = "kosmos_2" @classmethod def list(cls): @@ -81,6 +82,7 @@ class VLM(Enum): GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" MOONDREAM = "moondream" + KOSMOS_2 = "kosmos_2" @classmethod def list(cls): @@ -110,6 +112,9 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, VLM.MOONDREAM: dict, + VLM.KOSMOS_2: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[int, int, int, int]]]] + ], } REQUIRED_ARGUMENTS: dict[VLM, list[str]] = { @@ -120,6 +125,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], VLM.MOONDREAM: ["resolution_wh"], + VLM.KOSMOS_2: ["resolution_wh"], } ALLOWED_ARGUMENTS: dict[VLM, list[str]] = { @@ -130,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], VLM.MOONDREAM: ["resolution_wh"], + VLM.KOSMOS_2: ["resolution_wh", "classes"], } SUPPORTED_TASKS_FLORENCE_2 = [ @@ -169,9 +176,11 @@ def validate_vlm_parameters(vlm: VLM | str, result: Any, kwargs: dict[str, Any]) f"Invalid vlm value: {vlm}. 
Must be one of {[e.value for e in VLM]}" ) - if not isinstance(result, RESULT_TYPES[vlm]): + expected_type = RESULT_TYPES[vlm] + origin_type = get_origin(expected_type) or expected_type + if not isinstance(result, origin_type): raise ValueError( - f"Invalid VLM result type: {type(result)}. Must be {RESULT_TYPES[vlm]}" + f"Invalid VLM result type: {type(result)}. Must be {expected_type}" ) required_args = REQUIRED_ARGUMENTS.get(vlm, []) @@ -795,3 +804,59 @@ def from_moondream( return np.empty((0, 4)) return np.array(denormalize_xyxy, dtype=float) + + +def from_kosmos( + result: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[float, float, float, float]]]] + ], + resolution_wh: tuple[int, int], + classes: list[str] | None = None, +) -> tuple[np.ndarray, np.ndarray, list[str]]: + """ + Parse and scale bounding boxes from kosmos-2 result. + + The result is a tuple of a string and a list of tuples. + The first element of the tuple is the caption. + The second element of the tuple is a list of tuples containing the class name, + the start and end index of the class name in the caption, + and the bounding box coordinates normalized to the range [0, 1]. + + The result is supposed to be in the following format: + ```python + result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + ``` + + Args: + result: The result from the kosmos-2 model. + resolution_wh: (output_width, output_height) to which we rescale the boxes. + classes: Optional list of valid class names. If provided, returned boxes/labels + are filtered to only those classes found here.
+ + Returns: + xyxy (np.ndarray): An array of shape `(n, 4)` containing + the bounding boxes coordinates in format `[x1, y1, x2, y2]` + class_id (np.ndarray): An array of shape `(n,)` containing + the class indices for each bounding box + class_name (np.ndarray): An array of shape `(n,)` containing + the class labels for each bounding box + """ # noqa: E501 + _, entity_locations = result + xyxy, class_names = [], [] + for item in entity_locations: + class_name = item[0] + + if classes is not None and class_name not in classes: + continue + + bbox = item[2][0] + xyxy.append(denormalize_boxes(np.array(bbox), resolution_wh=resolution_wh)) + class_names.append(class_name) + return np.array(xyxy).reshape(-1, 4), np.array(range(len(xyxy))), class_names diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py index 8a8240e98..6f0a3ee52 100644 --- a/test/detection/test_vlm.py +++ b/test/detection/test_vlm.py @@ -214,7 +214,7 @@ def test_from_paligemma( ), # no snippet ( does_not_raise(), - "```json\nnot valid json\n```", + "```json\ninvalid json\n```", (640, 640), (1280, 720), None, @@ -227,7 +227,7 @@ def test_from_paligemma( (1280, 720), None, (np.empty((0, 4)), None, np.empty(0, dtype=str)), - ), # empty list + ), # empty JSON array ( does_not_raise(), """```json @@ -1232,3 +1232,117 @@ def test_from_deepseek_vl_2( detections.data[CLASS_NAME_DATA_FIELD], expected_detections.data[CLASS_NAME_DATA_FIELD], ) + + +@pytest.mark.parametrize( + "exception, result, resolution_wh, classes, expected_results", + [ + ( + DoesNotRaise(), + ("", []), + (1000, 1000), + None, + (np.empty((0, 4)), None, np.empty(0).astype(str)), + ), # empty result + ( + DoesNotRaise(), + ("An image of a cat and a dog.", []), + (1000, 1000), + None, + (np.empty((0, 4)), None, np.empty(0).astype(str)), + ), # caption but no detections + ( + DoesNotRaise(), + ("An image of a cat.", [("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)])]), + (1000, 1000), + None, + ( + np.array([[200.0, 300.0, 600.0, 
700.0]]), + np.array([0]), + np.array(["a cat"]).astype(str), + ), + ), # single detection + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (1000, 1000), + None, + ( + np.array([[200.0, 300.0, 600.0, 700.0], [500.0, 600.0, 800.0, 900.0]]), + np.array([0, 1]), + np.array(["a cat", "a dog"]).astype(str), + ), + ), # multiple detections + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (500, 500), + None, + ( + np.array([[100.0, 150.0, 300.0, 350.0], [250.0, 300.0, 400.0, 450.0]]), + np.array([0, 1]), + np.array(["a cat", "a dog"]).astype(str), + ), + ), # different resolution + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (1000, 1000), + ["a dog"], + ( + np.array([[500.0, 600.0, 800.0, 900.0]]), + np.array([0]), + np.array(["a dog"]).astype(str), + ), + ), # with class filtering + ], +) +def test_kosmos_2( + exception, + result: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[float, float, float, float]]]] + ], + resolution_wh: tuple[int, int], + classes: list[str] | None, + expected_results: tuple[np.ndarray, np.ndarray | None, np.ndarray], +): + with exception: + detections = Detections.from_vlm( + vlm=VLM.KOSMOS_2, + result=result, + resolution_wh=resolution_wh, + classes=classes, + ) + + xyxy, class_id, class_name = expected_results + + assert len(detections) == len(xyxy) + + if len(detections) == 0: + return + + assert np.allclose(detections.xyxy, xyxy) + if class_id is not None: + assert np.array_equal(detections.class_id, class_id) + assert np.array_equal( + detections.data[CLASS_NAME_DATA_FIELD], + class_name, + )