diff --git a/supervision/detection/core.py b/supervision/detection/core.py index ffe5ed3fc..a73968366 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -44,6 +44,7 @@ from_florence_2, from_google_gemini_2_0, from_google_gemini_2_5, + from_kosmos, from_moondream, from_paligemma, from_qwen_2_5_vl, @@ -832,6 +833,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | + | Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` | Args: lmm (Union[LMM, str]): The type of LMM (Large Multimodal Model) to use. @@ -1160,6 +1162,52 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio detections.data # {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='Detect the cats + ``` + + **For object detection, use the following user prompt:** + + ``` + Describe this image in detail + ``` + + ```python + import supervision as sv + + kosmos_result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + detections = sv.Detections.from_vlm( + sv.VLM.KOSMOS_2, + kosmos_result, + resolution_wh=image.size, + ) + detections.xyxy + # array([[310.78125, 11.625 , 822.65625, 732.375], + # [274.21875, 11.625 , 822.65625, 383.625], + # [ 18.28125, 290.625 , 201.09375, 732.375]]) + + detections.class_id + # array([0, 1, 2]) + + detections.data + # {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])} + ``` """ # noqa: E501 # 
filler logic mapping old from_lmm to new from_vlm @@ -1209,6 +1257,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | + | Kosmos 2 | `KOSMOS_2` | detection | `resolution_wh` | `classes` | Args: vlm (Union[VLM, str]): The type of VLM (Vision Language Model) to use. @@ -1538,6 +1587,52 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio # {'class_name': array(['The giraffe at the back', 'The giraffe at the front'], dtype='Detect the cats + ``` + + **For object detection, use the following user prompt:** + + ``` + Describe this image in detail + ``` + + ```python + + import supervision as sv + + kosmos_result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + detections = sv.Detections.from_vlm( + sv.VLM.KOSMOS_2, + kosmos_result, + resolution_wh=image.size, + ) + detections.xyxy + # array([[310.78125, 11.625 , 822.65625, 732.375], + # [274.21875, 11.625 , 822.65625, 383.625], + # [ 18.28125, 290.625 , 201.09375, 732.375]]) + + detections.class_id + # array([0, 1, 2]) + + detections.data + # {'class_name': array(['a small statue of a cat', 'a gramophone', 'a man'])} + ``` """ # noqa: E501 vlm = validate_vlm_parameters(vlm, result, kwargs) @@ -1592,6 +1687,12 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio data=data, ) + if vlm == VLM.KOSMOS_2: + xyxy, class_id, class_name = from_kosmos(result, **kwargs) + return cls( + xyxy=xyxy, 
class_id=class_id, data={CLASS_NAME_DATA_FIELD: class_name} + ) + return cls.empty() @classmethod diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 71207554e..528701e39 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -5,7 +5,7 @@ import json import re from enum import Enum -from typing import Any +from typing import Any, get_origin import numpy as np from PIL import Image @@ -40,6 +40,7 @@ class LMM(Enum): GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" MOONDREAM = "moondream" + KOSMOS_2 = "kosmos_2" @classmethod def list(cls): @@ -81,6 +82,7 @@ class VLM(Enum): GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" MOONDREAM = "moondream" + KOSMOS_2 = "kosmos_2" @classmethod def list(cls): @@ -110,6 +112,9 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, VLM.MOONDREAM: dict, + VLM.KOSMOS_2: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[int, int, int, int]]]] + ], } REQUIRED_ARGUMENTS: dict[VLM, list[str]] = { @@ -120,6 +125,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], VLM.MOONDREAM: ["resolution_wh"], + VLM.KOSMOS_2: ["resolution_wh"], } ALLOWED_ARGUMENTS: dict[VLM, list[str]] = { @@ -130,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], VLM.MOONDREAM: ["resolution_wh"], + VLM.KOSMOS_2: ["resolution_wh", "classes"], } SUPPORTED_TASKS_FLORENCE_2 = [ @@ -169,9 +176,11 @@ def validate_vlm_parameters(vlm: VLM | str, result: Any, kwargs: dict[str, Any]) f"Invalid vlm value: {vlm}. 
Must be one of {[e.value for e in VLM]}" ) - if not isinstance(result, RESULT_TYPES[vlm]): + expected_type = RESULT_TYPES[vlm] + origin_type = get_origin(expected_type) or expected_type + if not isinstance(result, origin_type): raise ValueError( - f"Invalid VLM result type: {type(result)}. Must be {RESULT_TYPES[vlm]}" + f"Invalid VLM result type: {type(result)}. Must be {expected_type}" ) required_args = REQUIRED_ARGUMENTS.get(vlm, []) @@ -795,3 +804,59 @@ def from_moondream( return np.empty((0, 4)) return np.array(denormalize_xyxy, dtype=float) + + +def from_kosmos( + result: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[float, float, float, float]]]] + ], + resolution_wh: tuple[int, int], + classes: list[str] | None = None, +) -> tuple[np.ndarray, np.ndarray, list[str]]: + """ + Parse and scale bounding boxes from kosmos-2 result. + + The result is a tuple of a string and a list of tuples. + The first element of the tuple is the caption. + The second element of the tuple is a list of tuples containing the class name, + the start and end index of the class name in the caption, + and the bounding box coordinates normalized to the range [0, 1]. + + The result is supposed to be in the following format: + ```python + result = ( + 'An image of a small statue of a cat, with a gramophone and a man walking past in the background.', + [ + ('a small statue of a cat', (12, 35), [(0.265625, 0.015625, 0.703125, 0.984375)]), + ('a gramophone', (42, 54), [(0.234375, 0.015625, 0.703125, 0.515625)]), + ('a man', (59, 64), [(0.015625, 0.390625, 0.171875, 0.984375)]) + ] + ) + ``` + + Args: + result: The result from the kosmos-2 model. + resolution_wh: (output_width, output_height) to which we rescale the boxes. + classes: Optional list of valid class names. If provided, returned boxes/labels + are filtered to only those classes found here.
+ + Returns: + xyxy (np.ndarray): An array of shape `(n, 4)` containing + the bounding boxes coordinates in format `[x1, y1, x2, y2]` + class_id (np.ndarray): An array of shape `(n,)` containing + the class indices for each bounding box + class_name (np.ndarray): An array of shape `(n,)` containing + the class labels for each bounding box + """ # noqa: E501 + _, entity_locations = result + xyxy, class_names = [], [] + for item in entity_locations: + class_name = item[0] + + if classes is not None and class_name not in classes: + continue + + bbox = item[2][0] + xyxy.append(denormalize_boxes(np.array(bbox), resolution_wh=resolution_wh)) + class_names.append(class_name) + return np.array(xyxy).reshape(-1, 4), np.array(range(len(xyxy))), class_names diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py index 8a8240e98..6f0a3ee52 100644 --- a/test/detection/test_vlm.py +++ b/test/detection/test_vlm.py @@ -214,7 +214,7 @@ def test_from_paligemma( ), # no snippet ( does_not_raise(), - "```json\nnot valid json\n```", + "```json\ninvalid json\n```", (640, 640), (1280, 720), None, @@ -227,7 +227,7 @@ def test_from_paligemma( (1280, 720), None, (np.empty((0, 4)), None, np.empty(0, dtype=str)), - ), # empty list + ), # empty JSON array ( does_not_raise(), """```json @@ -1232,3 +1232,117 @@ def test_from_deepseek_vl_2( detections.data[CLASS_NAME_DATA_FIELD], expected_detections.data[CLASS_NAME_DATA_FIELD], ) + + +@pytest.mark.parametrize( + "exception, result, resolution_wh, classes, expected_results", + [ + ( + DoesNotRaise(), + ("", []), + (1000, 1000), + None, + (np.empty((0, 4)), None, np.empty(0).astype(str)), + ), # empty result + ( + DoesNotRaise(), + ("An image of a cat and a dog.", []), + (1000, 1000), + None, + (np.empty((0, 4)), None, np.empty(0).astype(str)), + ), # caption but no detections + ( + DoesNotRaise(), + ("An image of a cat.", [("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)])]), + (1000, 1000), + None, + ( + np.array([[200.0, 300.0, 600.0, 
700.0]]), + np.array([0]), + np.array(["a cat"]).astype(str), + ), + ), # single detection + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (1000, 1000), + None, + ( + np.array([[200.0, 300.0, 600.0, 700.0], [500.0, 600.0, 800.0, 900.0]]), + np.array([0, 1]), + np.array(["a cat", "a dog"]).astype(str), + ), + ), # multiple detections + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (500, 500), + None, + ( + np.array([[100.0, 150.0, 300.0, 350.0], [250.0, 300.0, 400.0, 450.0]]), + np.array([0, 1]), + np.array(["a cat", "a dog"]).astype(str), + ), + ), # different resolution + ( + DoesNotRaise(), + ( + "An image of a cat and a dog.", + [ + ("a cat", (12, 17), [(0.2, 0.3, 0.6, 0.7)]), + ("a dog", (23, 28), [(0.5, 0.6, 0.8, 0.9)]), + ], + ), + (1000, 1000), + ["a dog"], + ( + np.array([[500.0, 600.0, 800.0, 900.0]]), + np.array([0]), + np.array(["a dog"]).astype(str), + ), + ), # with class filtering + ], +) +def test_kosmos_2( + exception, + result: tuple[ + str, list[tuple[str, tuple[int, int], list[tuple[float, float, float, float]]]] + ], + resolution_wh: tuple[int, int], + classes: list[str] | None, + expected_results: tuple[np.ndarray, np.ndarray | None, np.ndarray], +): + with exception: + detections = Detections.from_vlm( + vlm=VLM.KOSMOS_2, + result=result, + resolution_wh=resolution_wh, + classes=classes, + ) + + xyxy, class_id, class_name = expected_results + + assert len(detections) == len(xyxy) + + if len(detections) == 0: + return + + assert np.allclose(detections.xyxy, xyxy) + if class_id is not None: + assert np.array_equal(detections.class_id, class_id) + assert np.array_equal( + detections.data[CLASS_NAME_DATA_FIELD], + class_name, + )