From 9df9352e9d79f060c61d30b25faabee4bc5529ff Mon Sep 17 00:00:00 2001
From: Paul Bauriegel <paul.bauriegel@web.de>
Date: Thu, 31 Jul 2025 12:02:20 +0200
Subject: [PATCH 1/3] Add CoreML version of SAM 2.1

---
 anylabeling/configs/auto_labeling/models.yaml |  11 ++
 .../services/auto_labeling/model_manager.py   |  60 +++++---
 .../services/auto_labeling/sam2_coreml.py     | 143 ++++++++++++++++++
 .../auto_labeling/segment_anything.py         |  13 +-
 requirements-macos.txt                        |   1 +
 setup.py                                      |   1 +
 6 files changed, 205 insertions(+), 24 deletions(-)
 create mode 100644 anylabeling/services/auto_labeling/sam2_coreml.py

diff --git a/anylabeling/configs/auto_labeling/models.yaml b/anylabeling/configs/auto_labeling/models.yaml
index 7eadfd3..0941016 100644
--- a/anylabeling/configs/auto_labeling/models.yaml
+++ b/anylabeling/configs/auto_labeling/models.yaml
@@ -1,3 +1,14 @@
+- name: "sam2_1_coreml_large"
+  display_name: Segment Anything 2.1 (Large) CoreML
+  download_url: https://huggingface.co/apple/coreml-sam2.1-large
+  encoder_model_path: SAM2_1LargeImageEncoderFLOAT16.mlpackage
+  decoder_model_path: SAM2_1LargeMaskDecoderFLOAT16.mlpackage
+  image_encoder_model_path: SAM2_1LargeImageEncoderFLOAT16.mlpackage
+  prompt_encoder_model_path: SAM2_1LargePromptEncoderFLOAT16.mlpackage
+  input_size: 1024
+  max_height: 1024
+  max_width: 1024
+  type: segment_anything
 - name: "sam2_hiera_tiny_20240803"
   display_name: Segment Anything 2 (Hiera-Tiny)
   download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_tiny.zip
diff --git a/anylabeling/services/auto_labeling/model_manager.py b/anylabeling/services/auto_labeling/model_manager.py
index 86f903e..41abb69 100644
--- a/anylabeling/services/auto_labeling/model_manager.py
+++ b/anylabeling/services/auto_labeling/model_manager.py
@@ -21,6 +21,7 @@
 from anylabeling.config import get_config, save_config
 
 import ssl
+from huggingface_hub import snapshot_download
 
 ssl._create_default_https_context = (
     ssl._create_unverified_context
@@ -267,22 +268,7 @@ def load_model(self, config_file):
         self.model_download_thread.started.connect(self.model_download_worker.run)
         self.model_download_thread.start()
 
-    def _download_and_extract_model(self, model_config):
-        """Download and extract a model from model config"""
-        config_file = model_config["config_file"]
-        # Check if model is already downloaded
-        if not os.path.exists(config_file):
-            raise ValueError(self.tr("Error in loading config file."))
-        with open(config_file, "r") as f:
-            model_config = yaml.safe_load(f)
-        if model_config.get("has_downloaded", False):
-            return
-
-        # Download model
-        download_url = model_config.get("download_url", None)
-        if not download_url:
-            raise ValueError(self.tr("Missing download_url in config file."))
-        tmp_dir = tempfile.mkdtemp()
+    def download_zip(self, tmp_dir, download_url):
         zip_model_path = os.path.join(tmp_dir, "model.zip")
 
         # Download url
@@ -307,14 +293,11 @@ def _progress(count, block_size, total_size):
             print(f"Could not download {download_url}: {e}")
             self.new_model_status.emit(f"Could not download {download_url}")
             return None
-
         # Extract model
         tmp_extract_dir = os.path.join(tmp_dir, "extract")
-        extract_dir = os.path.dirname(config_file)
         with zipfile.ZipFile(zip_model_path, "r") as zip_ref:
             zip_ref.extractall(tmp_extract_dir)
-
-        # Find model folder (containing config.yaml)
+        # Find model folder (containing config.yaml)
         model_folder = None
         for root, _, files in os.walk(tmp_extract_dir):
             if "config.yaml" in files:
@@ -322,6 +305,43 @@ def _progress(count, block_size, total_size):
                 break
         if model_folder is None:
             raise ValueError(self.tr("Could not find config.yaml in zip file."))
+        return model_folder
+    
+    def download_hf(self, tmp_dir, download_url, model_config):
+        """Download a Hugging Face repo snapshot and write config.yaml into it."""
+        repo_id = download_url.split('https://huggingface.co/')[-1].strip('/')
+        tmp_extract_dir = os.path.join(tmp_dir, "extract")
+        # Mirror the whole repository into the temporary extract folder
+        snapshot_download(repo_id=repo_id, local_dir=tmp_extract_dir)
+        # Persist the model config as config.yaml alongside the downloaded files
+        with open(os.path.join(tmp_extract_dir, "config.yaml"), "w") as f:
+            yaml.dump(model_config, f, default_flow_style=False)
+        return tmp_extract_dir
+
+    def _download_and_extract_model(self, model_config):
+        """Download and extract a model from model config"""
+        config_file = model_config["config_file"]
+        extract_dir = os.path.dirname(config_file)
+        # Check if model is already downloaded
+        if not os.path.exists(config_file):
+            raise ValueError(self.tr("Error in loading config file."))
+        with open(config_file, "r") as f:
+            model_config = yaml.safe_load(f)
+        if model_config.get("has_downloaded", False):
+            return
+        # Zip is tested first: Hugging Face "resolve/...zip" links are archives
+        download_url = model_config.get("download_url", None)
+        if not download_url:
+            raise ValueError(self.tr("Missing download_url in config file."))
+        tmp_dir = tempfile.mkdtemp()
+        if download_url.endswith('.zip'):
+            model_folder = self.download_zip(tmp_dir, download_url)
+        elif download_url.startswith('https://huggingface.co'):
+            model_folder = self.download_hf(tmp_dir, download_url, model_config)
+        else:
+            raise ValueError(self.tr("Unsupported download_url in config file."))
+        if model_folder is None:  # download failed; keep the existing folder
+            return
 
         # Move model folder to correct location
         shutil.rmtree(extract_dir)
diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py
new file mode 100644
index 0000000..18a62ef
--- /dev/null
+++ b/anylabeling/services/auto_labeling/sam2_coreml.py
@@ -0,0 +1,143 @@
+
+import cv2
+import numpy as np
+import coremltools as ct
+from pathlib import Path
+from PIL import Image
+
+def find_contour_points(image_path: str):
+    # Load the image
+    image = cv2.imread(image_path)
+    thresh = get_binary_image(image)
+
+    # Find contours
+    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    # todo skip contour if it primary white
+    # Sort contours by area in descending order
+    sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)
+
+    # Select the largest contour
+    for largest_contour in sorted_contours:
+        if countour_content_is_primarily_white(image.copy(), largest_contour):
+            continue
+        M = cv2.moments(largest_contour)
+        if M["m00"] == 0:
+            raise ValueError("Countrour centroid issue")
+        cX = int(M["m10"] / M["m00"])
+        cY = int(M["m01"] / M["m00"])
+        centroid_point = (cX, cY)
+
+        mask = np.zeros(image.shape, dtype="uint8")
+        cv2.drawContours(mask, [largest_contour], -1, 255, -1)
+
+        # Find non-zero pixels (points inside the contour)
+        non_zero_points = np.argwhere(mask == 255)
+
+        # Select a random point
+        random_point_index = np.random.randint(0, len(non_zero_points))
+        random_point = non_zero_points[random_point_index]
+        return (centroid_point, random_point)
+
+class SegmentAnything2CoreML:
+    def __init__(self, model_path: str) -> None:
+        print("using CoreML", model_path)
+        self.image_encoder = ct.models.MLModel(model_path + "/SAM2_1LargeImageEncoderFLOAT16.mlpackage")
+        self.mask_decoder = ct.models.MLModel(model_path + "/SAM2_1LargeMaskDecoderFLOAT16.mlpackage")
+        self.prompt_encoder = ct.models.MLModel(model_path + "/SAM2_1LargePromptEncoderFLOAT16.mlpackage")
+        self.input_size = (1024, 1024)
+    
+    def encode(self, cv_image: np.ndarray) -> dict:
+        """Encodes the input image using the image encoder."""
+        # Convert OpenCV image to PIL Image
+        pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
+        
+        # Resize image to input_size
+        original_size = pil_image.size
+        resized_image = pil_image.resize(self.input_size, Image.Resampling.LANCZOS)
+
+        # Predict image embeddings
+        embeddings = self.image_encoder.predict({"image": resized_image})
+
+        return {
+            "high_res_feats_0": embeddings["feats_s0"],
+            "high_res_feats_1": embeddings["feats_s1"],
+            "image_embedding": embeddings["image_embedding"],
+            "original_size": original_size,
+        }
+
+    def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]:
+        """Predicts masks based on image embedding and prompt."""
+        points = []
+        labels = []
+        for mark in prompt:
+            if mark["type"] == "point":
+                # Scale point coordinates to match the model's input size
+                x_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0])
+                y_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1])
+                points.append([x_scaled, y_scaled])
+                labels.append(mark["label"])
+            elif mark["type"] == "rectangle":
+                # Scale rectangle coordinates
+                x1_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0])
+                y1_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1])
+                x2_scaled = mark["data"][2] * (self.input_size[0] / embedding["original_size"][0])
+                y2_scaled = mark["data"][3] * (self.input_size[1] / embedding["original_size"][1])
+                points.append([x1_scaled, y1_scaled])
+                points.append([x2_scaled, y2_scaled])
+                labels.append(2)  # Label for top-left of box
+                labels.append(3)  # Label for bottom-right of box
+
+        points_array = np.array(points, dtype=np.float32).reshape(1, len(points), 2)
+        labels_array = np.array(labels, dtype=np.int32).reshape(1, len(labels))
+
+        # Get prompt embeddings
+        prompt_embeddings = self.prompt_encoder.predict(
+            {"points": points_array, "labels": labels_array}
+        )
+
+        # Predict masks
+        mask_output = self.mask_decoder.predict(
+            {
+                "image_embedding": embedding["image_embedding"],
+                "sparse_embedding": prompt_embeddings["sparse_embeddings"],
+                "dense_embedding": prompt_embeddings["dense_embeddings"],
+                "feats_s0": embedding["high_res_feats_0"],
+                "feats_s1": embedding["high_res_feats_1"],
+            }
+        )
+        
+        # The model returns low_res_masks, which need to be upscaled and thresholded
+        low_res_masks = mask_output["low_res_masks"]
+        
+        # Select the best mask based on score
+        scores = mask_output["scores"]
+        best_mask_idx = np.argmax(scores)
+        mask = low_res_masks[0, best_mask_idx] # Assuming batch size of 1
+
+        # Resize the mask back to the original image size
+        original_width, original_height = embedding["original_size"]
+        mask = cv2.resize(mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR)
+        
+        # Apply threshold to get a binary mask
+        mask = (mask > 0).astype(np.uint8) * 255 # Convert to 0 or 255
+
+        return np.array([mask]) # Return as a list for consistency
+
+    
+    def transform_masks(self, masks, original_size, transform_matrix):
+        """Transform the masks back to the original image size."""
+        output_masks = []
+        for batch in range(masks.shape[0]):
+            batch_masks = []
+            for mask_id in range(masks.shape[1]):
+                mask = masks[batch, mask_id]
+                mask = cv2.warpAffine(
+                    mask,
+                    transform_matrix[:2],
+                    (original_size[1], original_size[0]),
+                    flags=cv2.INTER_LINEAR,
+                )
+                batch_masks.append(mask)
+            output_masks.append(batch_masks)
+        return np.array(output_masks)
diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py
index 4d9274a..8813119 100644
--- a/anylabeling/services/auto_labeling/segment_anything.py
+++ b/anylabeling/services/auto_labeling/segment_anything.py
@@ -18,7 +18,7 @@
 from .types import AutoLabelingResult
 from .sam_onnx import SegmentAnythingONNX
 from .sam2_onnx import SegmentAnything2ONNX
-
+from .sam2_coreml import SegmentAnything2CoreML
 
 class SegmentAnything(Model):
     """Segmentation model using SegmentAnything"""
@@ -57,7 +57,7 @@ def __init__(self, config_path, on_message) -> None:
         encoder_model_abs_path = self.get_model_abs_path(
             self.config, "encoder_model_path"
         )
-        if not encoder_model_abs_path or not os.path.isfile(encoder_model_abs_path):
+        if not encoder_model_abs_path or not (os.path.isfile(encoder_model_abs_path) or os.path.isdir(encoder_model_abs_path)):
             raise FileNotFoundError(
                 QCoreApplication.translate(
                     "Model",
@@ -67,7 +67,7 @@ def __init__(self, config_path, on_message) -> None:
         decoder_model_abs_path = self.get_model_abs_path(
             self.config, "decoder_model_path"
         )
-        if not decoder_model_abs_path or not os.path.isfile(decoder_model_abs_path):
+        if not decoder_model_abs_path or not (os.path.isfile(decoder_model_abs_path) or os.path.isdir(decoder_model_abs_path)):
             raise FileNotFoundError(
                 QCoreApplication.translate(
                     "Model",
@@ -76,7 +76,10 @@ def __init__(self, config_path, on_message) -> None:
             )
 
         # Load models
-        if self.detect_model_variant(decoder_model_abs_path) == "sam2":
+        if "coreml" in decoder_model_abs_path:
+            config_folder = os.path.dirname(decoder_model_abs_path)
+            self.model = SegmentAnything2CoreML(config_folder)
+        elif self.detect_model_variant(decoder_model_abs_path) == "sam2":
             self.model = SegmentAnything2ONNX(
                 encoder_model_abs_path, decoder_model_abs_path
             )
@@ -84,6 +87,8 @@ def __init__(self, config_path, on_message) -> None:
             self.model = SegmentAnythingONNX(
                 encoder_model_abs_path, decoder_model_abs_path
             )
+        #else:
+        #    self.model = SegmentAnything2CoreML("/Users/A92940251/Documents/AICC-Next/digibb/models")
 
         # Mark for auto labeling
         # points, rectangles
diff --git a/requirements-macos.txt b/requirements-macos.txt
index 85db885..5d63f69 100644
--- a/requirements-macos.txt
+++ b/requirements-macos.txt
@@ -8,3 +8,4 @@ onnx==1.16.1
 onnxruntime==1.18.1
 qimage2ndarray==1.10.0
 darkdetect==0.8.0
+coremltools==8.3.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2bb6e80..58c0d36 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,7 @@ def get_install_requires():
         "onnx==1.16.1",
         "qimage2ndarray==1.10.0",
         "darkdetect==0.8.0",
+        'coremltools==8.3.0; platform_system == "Darwin"',
     ]
 
     # Add onnxruntime-gpu if GPU is preferred

From fd12d64929dd797661d772f7f053a869075df371 Mon Sep 17 00:00:00 2001
From: Paul Bauriegel <paul.bauriegel@web.de>
Date: Thu, 31 Jul 2025 12:09:18 +0200
Subject: [PATCH 2/3] Remove unused code

---
 .../services/auto_labeling/sam2_coreml.py     | 111 +++++++-----------
 .../auto_labeling/segment_anything.py         |   2 -
 2 files changed, 41 insertions(+), 72 deletions(-)

diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py
index 18a62ef..552e893 100644
--- a/anylabeling/services/auto_labeling/sam2_coreml.py
+++ b/anylabeling/services/auto_labeling/sam2_coreml.py
@@ -1,57 +1,32 @@
-
 import cv2
 import numpy as np
 import coremltools as ct
 from pathlib import Path
 from PIL import Image
 
-def find_contour_points(image_path: str):
-    # Load the image
-    image = cv2.imread(image_path)
-    thresh = get_binary_image(image)
-
-    # Find contours
-    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-    # todo skip contour if it primary white
-    # Sort contours by area in descending order
-    sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)
-
-    # Select the largest contour
-    for largest_contour in sorted_contours:
-        if countour_content_is_primarily_white(image.copy(), largest_contour):
-            continue
-        M = cv2.moments(largest_contour)
-        if M["m00"] == 0:
-            raise ValueError("Countrour centroid issue")
-        cX = int(M["m10"] / M["m00"])
-        cY = int(M["m01"] / M["m00"])
-        centroid_point = (cX, cY)
-
-        mask = np.zeros(image.shape, dtype="uint8")
-        cv2.drawContours(mask, [largest_contour], -1, 255, -1)
-
-        # Find non-zero pixels (points inside the contour)
-        non_zero_points = np.argwhere(mask == 255)
-
-        # Select a random point
-        random_point_index = np.random.randint(0, len(non_zero_points))
-        random_point = non_zero_points[random_point_index]
-        return (centroid_point, random_point)
 
 class SegmentAnything2CoreML:
     def __init__(self, model_path: str) -> None:
         print("using CoreML", model_path)
-        self.image_encoder = ct.models.MLModel(model_path + "/SAM2_1LargeImageEncoderFLOAT16.mlpackage")
-        self.mask_decoder = ct.models.MLModel(model_path + "/SAM2_1LargeMaskDecoderFLOAT16.mlpackage")
-        self.prompt_encoder = ct.models.MLModel(model_path + "/SAM2_1LargePromptEncoderFLOAT16.mlpackage")
+        image_encoder_path = os.path.join(
+            model_path, "SAM2_1LargeImageEncoderFLOAT16.mlpackage"
+        )
+        mask_decoder_path = os.path.join(
+            model_path, "SAM2_1LargeMaskDecoderFLOAT16.mlpackage"
+        )
+        prompt_encoder_path = os.path.join(
+            model_path, "SAM2_1LargePromptEncoderFLOAT16.mlpackage"
+        )
+        self.image_encoder = ct.models.MLModel(image_encoder_path)
+        self.mask_decoder = ct.models.MLModel(mask_decoder_path)
+        self.prompt_encoder = ct.models.MLModel(prompt_encoder_path)
         self.input_size = (1024, 1024)
-    
+
     def encode(self, cv_image: np.ndarray) -> dict:
         """Encodes the input image using the image encoder."""
         # Convert OpenCV image to PIL Image
         pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB))
-        
+
         # Resize image to input_size
         original_size = pil_image.size
         resized_image = pil_image.resize(self.input_size, Image.Resampling.LANCZOS)
@@ -73,16 +48,28 @@ def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]:
         for mark in prompt:
             if mark["type"] == "point":
                 # Scale point coordinates to match the model's input size
-                x_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0])
-                y_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1])
+                x_scaled = mark["data"][0] * (
+                    self.input_size[0] / embedding["original_size"][0]
+                )
+                y_scaled = mark["data"][1] * (
+                    self.input_size[1] / embedding["original_size"][1]
+                )
                 points.append([x_scaled, y_scaled])
                 labels.append(mark["label"])
             elif mark["type"] == "rectangle":
                 # Scale rectangle coordinates
-                x1_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0])
-                y1_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1])
-                x2_scaled = mark["data"][2] * (self.input_size[0] / embedding["original_size"][0])
-                y2_scaled = mark["data"][3] * (self.input_size[1] / embedding["original_size"][1])
+                x1_scaled = mark["data"][0] * (
+                    self.input_size[0] / embedding["original_size"][0]
+                )
+                y1_scaled = mark["data"][1] * (
+                    self.input_size[1] / embedding["original_size"][1]
+                )
+                x2_scaled = mark["data"][2] * (
+                    self.input_size[0] / embedding["original_size"][0]
+                )
+                y2_scaled = mark["data"][3] * (
+                    self.input_size[1] / embedding["original_size"][1]
+                )
                 points.append([x1_scaled, y1_scaled])
                 points.append([x2_scaled, y2_scaled])
                 labels.append(2)  # Label for top-left of box
@@ -106,38 +93,22 @@ def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]:
                 "feats_s1": embedding["high_res_feats_1"],
             }
         )
-        
+
         # The model returns low_res_masks, which need to be upscaled and thresholded
         low_res_masks = mask_output["low_res_masks"]
-        
+
         # Select the best mask based on score
         scores = mask_output["scores"]
         best_mask_idx = np.argmax(scores)
-        mask = low_res_masks[0, best_mask_idx] # Assuming batch size of 1
+        mask = low_res_masks[0, best_mask_idx]  # Assuming batch size of 1
 
         # Resize the mask back to the original image size
         original_width, original_height = embedding["original_size"]
-        mask = cv2.resize(mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR)
-        
-        # Apply threshold to get a binary mask
-        mask = (mask > 0).astype(np.uint8) * 255 # Convert to 0 or 255
+        mask = cv2.resize(
+            mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR
+        )
 
-        return np.array([mask]) # Return as a list for consistency
+        # Apply threshold to get a binary mask
+        mask = (mask > 0).astype(np.uint8) * 255  # Convert to 0 or 255
 
-    
-    def transform_masks(self, masks, original_size, transform_matrix):
-        """Transform the masks back to the original image size."""
-        output_masks = []
-        for batch in range(masks.shape[0]):
-            batch_masks = []
-            for mask_id in range(masks.shape[1]):
-                mask = masks[batch, mask_id]
-                mask = cv2.warpAffine(
-                    mask,
-                    transform_matrix[:2],
-                    (original_size[1], original_size[0]),
-                    flags=cv2.INTER_LINEAR,
-                )
-                batch_masks.append(mask)
-            output_masks.append(batch_masks)
-        return np.array(output_masks)
+        return np.array([mask])  # Stacked into a (1, H, W) array, not a list
diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py
index 8813119..f757361 100644
--- a/anylabeling/services/auto_labeling/segment_anything.py
+++ b/anylabeling/services/auto_labeling/segment_anything.py
@@ -87,8 +87,6 @@ def __init__(self, config_path, on_message) -> None:
             self.model = SegmentAnythingONNX(
                 encoder_model_abs_path, decoder_model_abs_path
             )
-        #else:
-        #    self.model = SegmentAnything2CoreML("/Users/A92940251/Documents/AICC-Next/digibb/models")
 
         # Mark for auto labeling
         # points, rectangles

From e591407de772550d99effc693670c5a567b5d95c Mon Sep 17 00:00:00 2001
From: Paul Bauriegel <paul.bauriegel@web.de>
Date: Thu, 31 Jul 2025 12:23:31 +0200
Subject: [PATCH 3/3] Add missing import

---
 anylabeling/services/auto_labeling/sam2_coreml.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py
index 552e893..41ea0b9 100644
--- a/anylabeling/services/auto_labeling/sam2_coreml.py
+++ b/anylabeling/services/auto_labeling/sam2_coreml.py
@@ -1,3 +1,4 @@
+import os
 import cv2
 import numpy as np
 import coremltools as ct