From 9df9352e9d79f060c61d30b25faabee4bc5529ff Mon Sep 17 00:00:00 2001 From: Paul Bauriegel Date: Thu, 31 Jul 2025 12:02:20 +0200 Subject: [PATCH 1/3] Add CoreML version of SAM 2.1 --- anylabeling/configs/auto_labeling/models.yaml | 11 ++ .../services/auto_labeling/model_manager.py | 60 +++++--- .../services/auto_labeling/sam2_coreml.py | 143 ++++++++++++++++++ .../auto_labeling/segment_anything.py | 13 +- requirements-macos.txt | 1 + setup.py | 1 + 6 files changed, 205 insertions(+), 24 deletions(-) create mode 100644 anylabeling/services/auto_labeling/sam2_coreml.py diff --git a/anylabeling/configs/auto_labeling/models.yaml b/anylabeling/configs/auto_labeling/models.yaml index 7eadfd3..0941016 100644 --- a/anylabeling/configs/auto_labeling/models.yaml +++ b/anylabeling/configs/auto_labeling/models.yaml @@ -1,3 +1,14 @@ +- name: "sam2_1_coreml_large" + display_name: Segment Anything 2.1 (Large) CoreML + download_url: https://huggingface.co/apple/coreml-sam2.1-large + encoder_model_path: SAM2_1LargeImageEncoderFLOAT16.mlpackage + decoder_model_path: SAM2_1LargeMaskDecoderFLOAT16.mlpackage + image_encoder_model_path: SAM2_1LargeImageEncoderFLOAT16.mlpackage + prompt_encoder_model_path: sSAM2_1LargePromptEncoderFLOAT16.mlpackage + input_size: 1024 + max_height: 1024 + max_width: 1024 + type: segment_anything - name: "sam2_hiera_tiny_20240803" display_name: Segment Anything 2 (Hiera-Tiny) download_url: https://huggingface.co/vietanhdev/segment-anything-2-onnx-models/resolve/main/sam2_hiera_tiny.zip diff --git a/anylabeling/services/auto_labeling/model_manager.py b/anylabeling/services/auto_labeling/model_manager.py index 86f903e..41abb69 100644 --- a/anylabeling/services/auto_labeling/model_manager.py +++ b/anylabeling/services/auto_labeling/model_manager.py @@ -21,6 +21,7 @@ from anylabeling.config import get_config, save_config import ssl +from huggingface_hub import snapshot_download ssl._create_default_https_context = ( ssl._create_unverified_context @@ -267,22 +268,7 @@ def load_model(self, config_file): self.model_download_thread.started.connect(self.model_download_worker.run) self.model_download_thread.start() - def _download_and_extract_model(self, model_config): - """Download and extract a model from model config""" - config_file = model_config["config_file"] - # Check if model is already downloaded - if not os.path.exists(config_file): - raise ValueError(self.tr("Error in loading config file.")) - with open(config_file, "r") as f: - model_config = yaml.safe_load(f) - if model_config.get("has_downloaded", False): - return - - # Download model - download_url = model_config.get("download_url", None) - if not download_url: - raise ValueError(self.tr("Missing download_url in config file.")) - tmp_dir = tempfile.mkdtemp() + def download_zip(self, tmp_dir, download_url): zip_model_path = os.path.join(tmp_dir, "model.zip") # Download url @@ -307,14 +293,11 @@ def _progress(count, block_size, total_size): print(f"Could not download {download_url}: {e}") self.new_model_status.emit(f"Could not download {download_url}") return None - # Extract model tmp_extract_dir = os.path.join(tmp_dir, "extract") - extract_dir = os.path.dirname(config_file) with zipfile.ZipFile(zip_model_path, "r") as zip_ref: zip_ref.extractall(tmp_extract_dir) - - # Find model folder (containing config.yaml) + # Find model folder (containing config.yaml) model_folder = None for root, _, files in os.walk(tmp_extract_dir): if "config.yaml" in files: @@ -322,6 +305,43 @@ def _progress(count, block_size, total_size): break if model_folder is None: raise ValueError(self.tr("Could not find config.yaml in zip file.")) + return model_folder + + def download_hf(self, tmp_dir, download_url, model_config): + repo_id = download_url.split('https://huggingface.co/')[-1].strip('/') + tmp_extract_dir = os.path.join(tmp_dir, "extract") + local_dir = snapshot_download( + repo_id=repo_id, + local_dir=tmp_extract_dir # where to store everything + ) + with open(tmp_extract_dir + "/config.yaml", "w") as f: + model_config = yaml.dump(model_config, f, default_flow_style=False) + return tmp_extract_dir + + def _download_and_extract_model(self, model_config): + """Download and extract a model from model config""" + config_file = model_config["config_file"] + extract_dir = os.path.dirname(config_file) + # Check if model is already downloaded + if not os.path.exists(config_file): + raise ValueError(self.tr("Error in loading config file.")) + with open(config_file, "r") as f: + model_config = yaml.safe_load(f) + if model_config.get("has_downloaded", False): + return + + # Download model + download_url = model_config.get("download_url", None) + if not download_url: + raise ValueError(self.tr("Missing download_url in config file.")) + + tmp_dir = tempfile.mkdtemp() + if download_url.endswith('.zip'): + model_folder = self.download_zip(tmp_dir, download_url) + + if download_url.startswith('https://huggingface.co'): + model_folder = self.download_hf(tmp_dir, download_url, model_config) + # Move model folder to correct location shutil.rmtree(extract_dir) diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py new file mode 100644 index 0000000..18a62ef --- /dev/null +++ b/anylabeling/services/auto_labeling/sam2_coreml.py @@ -0,0 +1,143 @@ + +import cv2 +import numpy as np +import coremltools as ct +from pathlib import Path +from PIL import Image + +def find_contour_points(image_path: str): + # Load the image + image = cv2.imread(image_path) + thresh = get_binary_image(image) + + # Find contours + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # todo skip contour if it primary white + # Sort contours by area in descending order + sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True) + + # Select the largest contour + for largest_contour in sorted_contours: + if countour_content_is_primarily_white(image.copy(), largest_contour): + continue + M = cv2.moments(largest_contour) + if M["m00"] == 0: + raise ValueError("Countrour centroid issue") + cX = int(M["m10"] / M["m00"]) + cY = int(M["m01"] / M["m00"]) + centroid_point = (cX, cY) + + mask = np.zeros(image.shape, dtype="uint8") + cv2.drawContours(mask, [largest_contour], -1, 255, -1) + + # Find non-zero pixels (points inside the contour) + non_zero_points = np.argwhere(mask == 255) + + # Select a random point + random_point_index = np.random.randint(0, len(non_zero_points)) + random_point = non_zero_points[random_point_index] + return (centroid_point, random_point) + +class SegmentAnything2CoreML: + def __init__(self, model_path: str) -> None: + print("using CoreML", model_path) + self.image_encoder = ct.models.MLModel(model_path + "/SAM2_1LargeImageEncoderFLOAT16.mlpackage") + self.mask_decoder = ct.models.MLModel(model_path + "/SAM2_1LargeMaskDecoderFLOAT16.mlpackage") + self.prompt_encoder = ct.models.MLModel(model_path + "/SAM2_1LargePromptEncoderFLOAT16.mlpackage") + self.input_size = (1024, 1024) + + def encode(self, cv_image: np.ndarray) -> dict: + """Encodes the input image using the image encoder.""" + # Convert OpenCV image to PIL Image + pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)) + + # Resize image to input_size + original_size = pil_image.size + resized_image = pil_image.resize(self.input_size, Image.Resampling.LANCZOS) + + # Predict image embeddings + embeddings = self.image_encoder.predict({"image": resized_image}) + + return { + "high_res_feats_0": embeddings["feats_s0"], + "high_res_feats_1": embeddings["feats_s1"], + "image_embedding": embeddings["image_embedding"], + "original_size": original_size, + } + + def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]: + """Predicts masks based on image embedding and prompt.""" + points = [] + labels = [] + for mark in prompt: + if mark["type"] == "point": + # Scale point coordinates to match the model's input size + x_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0]) + y_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1]) + points.append([x_scaled, y_scaled]) + labels.append(mark["label"]) + elif mark["type"] == "rectangle": + # Scale rectangle coordinates + x1_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0]) + y1_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1]) + x2_scaled = mark["data"][2] * (self.input_size[0] / embedding["original_size"][0]) + y2_scaled = mark["data"][3] * (self.input_size[1] / embedding["original_size"][1]) + points.append([x1_scaled, y1_scaled]) + points.append([x2_scaled, y2_scaled]) + labels.append(2) # Label for top-left of box + labels.append(3) # Label for bottom-right of box + + points_array = np.array(points, dtype=np.float32).reshape(1, len(points), 2) + labels_array = np.array(labels, dtype=np.int32).reshape(1, len(labels)) + + # Get prompt embeddings + prompt_embeddings = self.prompt_encoder.predict( + {"points": points_array, "labels": labels_array} + ) + + # Predict masks + mask_output = self.mask_decoder.predict( + { + "image_embedding": embedding["image_embedding"], + "sparse_embedding": prompt_embeddings["sparse_embeddings"], + "dense_embedding": prompt_embeddings["dense_embeddings"], + "feats_s0": embedding["high_res_feats_0"], + "feats_s1": embedding["high_res_feats_1"], + } + ) + + # The model returns low_res_masks, which need to be upscaled and thresholded + low_res_masks = mask_output["low_res_masks"] + + # Select the best mask based on score + scores = mask_output["scores"] + best_mask_idx = np.argmax(scores) + mask = low_res_masks[0, best_mask_idx] # Assuming batch size of 1 + + # Resize the mask back to the original image size + original_width, original_height = embedding["original_size"] + mask = cv2.resize(mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR) + + # Apply threshold to get a binary mask + mask = (mask > 0).astype(np.uint8) * 255 # Convert to 0 or 255 + + return np.array([mask]) # Return as a list for consistency + + + def transform_masks(self, masks, original_size, transform_matrix): + """Transform the masks back to the original image size.""" + output_masks = [] + for batch in range(masks.shape[0]): + batch_masks = [] + for mask_id in range(masks.shape[1]): + mask = masks[batch, mask_id] + mask = cv2.warpAffine( + mask, + transform_matrix[:2], + (original_size[1], original_size[0]), + flags=cv2.INTER_LINEAR, + ) + batch_masks.append(mask) + output_masks.append(batch_masks) + return np.array(output_masks) diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py index 4d9274a..8813119 100644 --- a/anylabeling/services/auto_labeling/segment_anything.py +++ b/anylabeling/services/auto_labeling/segment_anything.py @@ -18,7 +18,7 @@ from .types import AutoLabelingResult from .sam_onnx import SegmentAnythingONNX from .sam2_onnx import SegmentAnything2ONNX - +from .sam2_coreml import SegmentAnything2CoreML class SegmentAnything(Model): """Segmentation model using SegmentAnything""" @@ -57,7 +57,7 @@ def __init__(self, config_path, on_message) -> None: encoder_model_abs_path = self.get_model_abs_path( self.config, "encoder_model_path" ) - if not encoder_model_abs_path or not os.path.isfile(encoder_model_abs_path): + if not encoder_model_abs_path or not (os.path.isfile(encoder_model_abs_path) or os.path.isdir(encoder_model_abs_path)): raise FileNotFoundError( QCoreApplication.translate( "Model", @@ -67,7 +67,7 @@ def __init__(self, config_path, on_message) -> None: decoder_model_abs_path = self.get_model_abs_path( self.config, "decoder_model_path" ) - if not decoder_model_abs_path or not os.path.isfile(decoder_model_abs_path): + if not decoder_model_abs_path or not (os.path.isfile(decoder_model_abs_path) or os.path.isdir(decoder_model_abs_path)): raise FileNotFoundError( QCoreApplication.translate( "Model", @@ -76,7 +76,10 @@ def __init__(self, config_path, on_message) -> None: ) # Load models - if self.detect_model_variant(decoder_model_abs_path) == "sam2": + if "coreml" in decoder_model_abs_path: + config_folder = os.path.dirname(decoder_model_abs_path) + self.model = SegmentAnything2CoreML(config_folder) + elif self.detect_model_variant(decoder_model_abs_path) == "sam2": self.model = SegmentAnything2ONNX( encoder_model_abs_path, decoder_model_abs_path ) @@ -84,6 +87,8 @@ def __init__(self, config_path, on_message) -> None: self.model = SegmentAnythingONNX( encoder_model_abs_path, decoder_model_abs_path ) + #else: + # self.model = SegmentAnything2CoreML("/Users/A92940251/Documents/AICC-Next/digibb/models") # Mark for auto labeling # points, rectangles diff --git a/requirements-macos.txt b/requirements-macos.txt index 85db885..5d63f69 100644 --- a/requirements-macos.txt +++ b/requirements-macos.txt @@ -8,3 +8,4 @@ onnx==1.16.1 onnxruntime==1.18.1 qimage2ndarray==1.10.0 darkdetect==0.8.0 +coremltools==8.3.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 2bb6e80..58c0d36 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ def get_install_requires(): "onnx==1.16.1", "qimage2ndarray==1.10.0", "darkdetect==0.8.0", + 'coremltools==8.3.0; platform_system == "Darwin"', ] # Add onnxruntime-gpu if GPU is preferred From fd12d64929dd797661d772f7f053a869075df371 Mon Sep 17 00:00:00 2001 From: Paul Bauriegel Date: Thu, 31 Jul 2025 12:09:18 +0200 Subject: [PATCH 2/3] Remove unused code --- .../services/auto_labeling/sam2_coreml.py | 111 +++++++----------- .../auto_labeling/segment_anything.py | 2 - 2 files changed, 41 insertions(+), 72 deletions(-) diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py index 18a62ef..552e893 100644 --- a/anylabeling/services/auto_labeling/sam2_coreml.py +++ b/anylabeling/services/auto_labeling/sam2_coreml.py @@ -1,57 +1,32 @@ - import cv2 import numpy as np import coremltools as ct from pathlib import Path from PIL import Image -def find_contour_points(image_path: str): - # Load the image - image = cv2.imread(image_path) - thresh = get_binary_image(image) - - # Find contours - contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - # todo skip contour if it primary white - # Sort contours by area in descending order - sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True) - - # Select the largest contour - for largest_contour in sorted_contours: - if countour_content_is_primarily_white(image.copy(), largest_contour): - continue - M = cv2.moments(largest_contour) - if M["m00"] == 0: - raise ValueError("Countrour centroid issue") - cX = int(M["m10"] / M["m00"]) - cY = int(M["m01"] / M["m00"]) - centroid_point = (cX, cY) - - mask = np.zeros(image.shape, dtype="uint8") - cv2.drawContours(mask, [largest_contour], -1, 255, -1) - - # Find non-zero pixels (points inside the contour) - non_zero_points = np.argwhere(mask == 255) - - # Select a random point - random_point_index = np.random.randint(0, len(non_zero_points)) - random_point = non_zero_points[random_point_index] - return (centroid_point, random_point) class SegmentAnything2CoreML: def __init__(self, model_path: str) -> None: print("using CoreML", model_path) - self.image_encoder = ct.models.MLModel(model_path + "/SAM2_1LargeImageEncoderFLOAT16.mlpackage") - self.mask_decoder = ct.models.MLModel(model_path + "/SAM2_1LargeMaskDecoderFLOAT16.mlpackage") - self.prompt_encoder = ct.models.MLModel(model_path + "/SAM2_1LargePromptEncoderFLOAT16.mlpackage") + image_decoder_path = os.path.join( + model_path, "SAM2_1LargeImageEncoderFLOAT16.mlpackage" + ) + mask_decoder_path = os.path.join( + model_path, "SAM2_1LargeMaskDecoderFLOAT16.mlpackage" + ) + prompt_encoder_path = os.path.join( + model_path, "SAM2_1LargePromptEncoderFLOAT16.mlpackage" + ) + self.image_encoder = ct.models.MLModel(image_decoder_path) + self.mask_decoder = ct.models.MLModel(mask_decoder_path) + self.prompt_encoder = ct.models.MLModel(prompt_encoder_path) self.input_size = (1024, 1024) - + def encode(self, cv_image: np.ndarray) -> dict: """Encodes the input image using the image encoder.""" # Convert OpenCV image to PIL Image pil_image = Image.fromarray(cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)) - + # Resize image to input_size original_size = pil_image.size resized_image = pil_image.resize(self.input_size, Image.Resampling.LANCZOS) @@ -73,16 +48,28 @@ def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]: for mark in prompt: if mark["type"] == "point": # Scale point coordinates to match the model's input size - x_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0]) - y_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1]) + x_scaled = mark["data"][0] * ( + self.input_size[0] / embedding["original_size"][0] + ) + y_scaled = mark["data"][1] * ( + self.input_size[1] / embedding["original_size"][1] + ) points.append([x_scaled, y_scaled]) labels.append(mark["label"]) elif mark["type"] == "rectangle": # Scale rectangle coordinates - x1_scaled = mark["data"][0] * (self.input_size[0] / embedding["original_size"][0]) - y1_scaled = mark["data"][1] * (self.input_size[1] / embedding["original_size"][1]) - x2_scaled = mark["data"][2] * (self.input_size[0] / embedding["original_size"][0]) - y2_scaled = mark["data"][3] * (self.input_size[1] / embedding["original_size"][1]) + x1_scaled = mark["data"][0] * ( + self.input_size[0] / embedding["original_size"][0] + ) + y1_scaled = mark["data"][1] * ( + self.input_size[1] / embedding["original_size"][1] + ) + x2_scaled = mark["data"][2] * ( + self.input_size[0] / embedding["original_size"][0] + ) + y2_scaled = mark["data"][3] * ( + self.input_size[1] / embedding["original_size"][1] + ) points.append([x1_scaled, y1_scaled]) points.append([x2_scaled, y2_scaled]) labels.append(2) # Label for top-left of box @@ -106,38 +93,22 @@ def predict_masks(self, embedding: dict, prompt: list) -> list[np.ndarray]: "feats_s1": embedding["high_res_feats_1"], } ) - + # The model returns low_res_masks, which need to be upscaled and thresholded low_res_masks = mask_output["low_res_masks"] - + # Select the best mask based on score scores = mask_output["scores"] best_mask_idx = np.argmax(scores) - mask = low_res_masks[0, best_mask_idx] # Assuming batch size of 1 + mask = low_res_masks[0, best_mask_idx] # Assuming batch size of 1 # Resize the mask back to the original image size original_width, original_height = embedding["original_size"] - mask = cv2.resize(mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR) - - # Apply threshold to get a binary mask - mask = (mask > 0).astype(np.uint8) * 255 # Convert to 0 or 255 + mask = cv2.resize( + mask, (original_width, original_height), interpolation=cv2.INTER_LINEAR + ) - return np.array([mask]) # Return as a list for consistency + # Apply threshold to get a binary mask + mask = (mask > 0).astype(np.uint8) * 255 # Convert to 0 or 255 - - def transform_masks(self, masks, original_size, transform_matrix): - """Transform the masks back to the original image size.""" - output_masks = [] - for batch in range(masks.shape[0]): - batch_masks = [] - for mask_id in range(masks.shape[1]): - mask = masks[batch, mask_id] - mask = cv2.warpAffine( - mask, - transform_matrix[:2], - (original_size[1], original_size[0]), - flags=cv2.INTER_LINEAR, - ) - batch_masks.append(mask) - output_masks.append(batch_masks) - return np.array(output_masks) + return np.array([mask]) # Return as a list for consistency diff --git a/anylabeling/services/auto_labeling/segment_anything.py b/anylabeling/services/auto_labeling/segment_anything.py index 8813119..f757361 100644 --- a/anylabeling/services/auto_labeling/segment_anything.py +++ b/anylabeling/services/auto_labeling/segment_anything.py @@ -87,8 +87,6 @@ def __init__(self, config_path, on_message) -> None: self.model = SegmentAnythingONNX( encoder_model_abs_path, decoder_model_abs_path ) - #else: - # self.model = SegmentAnything2CoreML("/Users/A92940251/Documents/AICC-Next/digibb/models") # Mark for auto labeling # points, rectangles From e591407de772550d99effc693670c5a567b5d95c Mon Sep 17 00:00:00 2001 From: Paul Bauriegel Date: Thu, 31 Jul 2025 12:23:31 +0200 Subject: [PATCH 3/3] Add missing import --- anylabeling/services/auto_labeling/sam2_coreml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anylabeling/services/auto_labeling/sam2_coreml.py b/anylabeling/services/auto_labeling/sam2_coreml.py index 552e893..41ea0b9 100644 --- a/anylabeling/services/auto_labeling/sam2_coreml.py +++ b/anylabeling/services/auto_labeling/sam2_coreml.py @@ -1,3 +1,4 @@ +import os import cv2 import numpy as np import coremltools as ct