From b89d6f407044276b1f54753ef98c719e89928631 Mon Sep 17 00:00:00 2001
From: Mohammed Yasin <32206511+Y-T-G@users.noreply.github.com>
Date: Sun, 13 Oct 2024 23:20:40 +0800
Subject: [PATCH] `ultralytics 8.3.12` SAM and SAM2 multi-point prompts
 (#16643)

Co-authored-by: UltralyticsAssistant
Co-authored-by: Ultralytics Assistant <135830346+UltralyticsAssistant@users.noreply.github.com>
Co-authored-by: Glenn Jocher
---
 docs/en/models/mobile-sam.md      | 24 +++++++++++++++++++++---
 docs/en/models/sam.md             | 30 ++++++++++++++++++++++++++++--
 tests/test_cli.py                 |  5 ++++-
 tests/test_cuda.py                | 14 +++++++++++++-
 ultralytics/__init__.py           |  2 +-
 ultralytics/models/sam/predict.py | 17 ++++++++++++-----
 6 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/docs/en/models/mobile-sam.md b/docs/en/models/mobile-sam.md
index 0d7df2a2ca..0529db6603 100644
--- a/docs/en/models/mobile-sam.md
+++ b/docs/en/models/mobile-sam.md
@@ -90,8 +90,17 @@ You can download the model [here](https://github.com/ChaoningZhang/MobileSAM/blo
     # Load the model
     model = SAM("mobile_sam.pt")
 
-    # Predict a segment based on a point prompt
+    # Predict a segment based on a single point prompt
     model.predict("ultralytics/assets/zidane.jpg", points=[900, 370], labels=[1])
+
+    # Predict multiple segments based on multiple point prompts
+    model.predict("ultralytics/assets/zidane.jpg", points=[[400, 370], [900, 370]], labels=[1, 1])
+
+    # Predict a segment based on multiple point prompts per object
+    model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 1]])
+
+    # Predict a segment using both positive and negative point prompts
+    model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
     ```
 
 ### Box Prompt
@@ -106,8 +115,17 @@ You can download the model [here](https://github.com/ChaoningZhang/MobileSAM/blo
     # Load the model
     model = SAM("mobile_sam.pt")
 
-    # Predict a segment based on a box prompt
-    model.predict("ultralytics/assets/zidane.jpg", bboxes=[439, 437, 524, 709])
+    # Predict a segment based on a single point prompt
+    model.predict("ultralytics/assets/zidane.jpg", points=[900, 370], labels=[1])
+
+    # Predict multiple segments based on multiple point prompts
+    model.predict("ultralytics/assets/zidane.jpg", points=[[400, 370], [900, 370]], labels=[1, 1])
+
+    # Predict a segment based on multiple point prompts per object
+    model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 1]])
+
+    # Predict a segment using both positive and negative point prompts
+    model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
     ```
 
 We have implemented `MobileSAM` and `SAM` using the same API. For more usage information, please see the [SAM page](sam.md).
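The commit title also covers SAM 2. A minimal sketch of the same multi-point prompt shapes with SAM 2 weights, assuming (as shown elsewhere in the Ultralytics docs) that the same `SAM` wrapper also resolves `sam2_b.pt`:

```python
from ultralytics import SAM

# Load SAM 2 weights through the same wrapper (assumption: the `SAM`
# class accepts "sam2_b.pt", as in the Ultralytics SAM 2 docs)
model = SAM("sam2_b.pt")

# One object described by two positive points: points (1, 2, 2), labels (1, 2)
model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 1]])

# Same two points, but the second acts as a negative (background) click
model.predict("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
```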
diff --git a/docs/en/models/sam.md b/docs/en/models/sam.md
index 304fc00287..1a5c0db4a7 100644
--- a/docs/en/models/sam.md
+++ b/docs/en/models/sam.md
@@ -58,8 +58,17 @@ The Segment Anything Model can be employed for a multitude of downstream tasks t
         # Run inference with bboxes prompt
         results = model("ultralytics/assets/zidane.jpg", bboxes=[439, 437, 524, 709])
 
-        # Run inference with points prompt
-        results = model("ultralytics/assets/zidane.jpg", points=[900, 370], labels=[1])
+        # Run inference with a single point prompt
+        results = model("ultralytics/assets/zidane.jpg", points=[900, 370], labels=[1])
+
+        # Run inference with multiple point prompts
+        results = model("ultralytics/assets/zidane.jpg", points=[[400, 370], [900, 370]], labels=[1, 1])
+
+        # Run inference with multiple point prompts per object
+        results = model("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 1]])
+
+        # Run inference with negative point prompts
+        results = model("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
     ```
 
 !!! example "Segment everything"
@@ -107,8 +116,16 @@ The Segment Anything Model can be employed for a multitude of downstream tasks t
         predictor.set_image("ultralytics/assets/zidane.jpg")  # set with image file
         predictor.set_image(cv2.imread("ultralytics/assets/zidane.jpg"))  # set with np.ndarray
         results = predictor(bboxes=[439, 437, 524, 709])
+
+        # Run inference with a single point prompt
         results = predictor(points=[900, 370], labels=[1])
 
+        # Run inference with multiple point prompts
+        results = predictor(points=[[400, 370], [900, 370]], labels=[1, 1])
+
+        # Run inference with negative point prompts
+        results = predictor(points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
+
         # Reset image
         predictor.reset_image()
     ```
@@ -245,6 +262,15 @@
 model("ultralytics/assets/zidane.jpg", bboxes=[439, 437, 524, 709])
 
 # Segment with points prompt
 model("ultralytics/assets/zidane.jpg", points=[900, 370], labels=[1])
+
+# Segment with multiple point prompts
+model("ultralytics/assets/zidane.jpg", points=[[400, 370], [900, 370]], labels=[1, 1])
+
+# Segment with multiple point prompts per object
+model("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 1]])
+
+# Segment with negative point prompts
+model("ultralytics/assets/zidane.jpg", points=[[[400, 370], [900, 370]]], labels=[[1, 0]])
 ```
 
 Alternatively, you can run inference with SAM in the command line interface (CLI):
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 3eadf3c24e..05e06bd7aa 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -97,9 +97,12 @@ def test_mobilesam():
     # Source
     source = ASSETS / "zidane.jpg"
 
-    # Predict a segment based on a point prompt
+    # Predict a segment based on a 1D point prompt and 1D labels.
     model.predict(source, points=[900, 370], labels=[1])
 
+    # Predict a segment based on 3D points and 2D labels (multiple points per object).
+    model.predict(source, points=[[[900, 370], [1000, 100]]], labels=[[1, 1]])
+
     # Predict a segment based on a box prompt
     model.predict(source, bboxes=[439, 437, 524, 709], save=True)
diff --git a/tests/test_cuda.py b/tests/test_cuda.py
index 3b08edc699..89f8c39b25 100644
--- a/tests/test_cuda.py
+++ b/tests/test_cuda.py
@@ -127,9 +127,21 @@ def test_predict_sam():
     # Run inference with bboxes prompt
     model(SOURCE, bboxes=[439, 437, 524, 709], device=0)
 
-    # Run inference with points prompt
+    # Run inference with points and no labels
+    model(ASSETS / "zidane.jpg", points=[900, 370], device=0)
+
+    # Run inference with 1D points and 1D labels
     model(ASSETS / "zidane.jpg", points=[900, 370], labels=[1], device=0)
 
+    # Run inference with 2D points and 1D labels
+    model(ASSETS / "zidane.jpg", points=[[900, 370]], labels=[1], device=0)
+
+    # Run inference with multiple 2D points and 1D labels
+    model(ASSETS / "zidane.jpg", points=[[400, 370], [900, 370]], labels=[1, 1], device=0)
+
+    # Run inference with 3D points and 2D labels (multiple points per object)
+    model(ASSETS / "zidane.jpg", points=[[[900, 370], [1000, 100]]], labels=[[1, 1]], device=0)
+
     # Create SAMPredictor
     overrides = dict(conf=0.25, task="segment", mode="predict", imgsz=1024, model=WEIGHTS_DIR / "mobile_sam.pt")
     predictor = SAMPredictor(overrides=overrides)
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index 827234c78f..5360c25e18 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.11"
+__version__ = "8.3.12"
 
 import os
diff --git a/ultralytics/models/sam/predict.py b/ultralytics/models/sam/predict.py
index 768d63d8f1..978f7cfd68 100644
--- a/ultralytics/models/sam/predict.py
+++ b/ultralytics/models/sam/predict.py
@@ -213,11 +213,14 @@ class Predictor(BasePredictor):
         Args:
             im (torch.Tensor): Preprocessed input image tensor with shape (N, C, H, W).
             bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
-            points (np.ndarray | List | None): Points indicating object locations with shape (N, 2), in pixels.
-            labels (np.ndarray | List | None): Point prompt labels with shape (N,). 1 for foreground, 0 for background.
+            points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
+            labels (np.ndarray | List | None): Point prompt labels with shape (N,) or (N, num_points). 1 for foreground, 0 for background.
             masks (np.ndarray | None): Low-res masks from previous predictions with shape (N, H, W). For SAM, H=W=256.
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
 
+        Raises:
+            AssertionError: If the number of points doesn't match the number of labels when labels are passed.
+
         Returns:
             (tuple): Tuple containing:
                 - np.ndarray: Output masks with shape (C, H, W), where C is the number of generated masks.
@@ -240,11 +243,15 @@ class Predictor(BasePredictor):
             points = points[None] if points.ndim == 1 else points
             # Assuming labels are all positive if users don't pass labels.
             if labels is None:
-                labels = np.ones(points.shape[0])
+                labels = np.ones(points.shape[:-1])
             labels = torch.as_tensor(labels, dtype=torch.int32, device=self.device)
+            assert (
+                points.shape[-2] == labels.shape[-1]
+            ), f"Number of points {points.shape[-2]} should match number of labels {labels.shape[-1]}."
             points *= r
-            # (N, 2) --> (N, 1, 2), (N, ) --> (N, 1)
-            points, labels = points[:, None, :], labels[:, None]
+            if points.ndim == 2:
+                # (N, 2) --> (N, 1, 2), (N, ) --> (N, 1)
+                points, labels = points[:, None, :], labels[:, None]
         if bboxes is not None:
             bboxes = torch.as_tensor(bboxes, dtype=torch.float32, device=self.device)
             bboxes = bboxes[None] if bboxes.ndim == 1 else bboxes
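For reference, a minimal NumPy sketch of the shape handling that the predict.py hunks above introduce. `normalize_prompts` is a hypothetical standalone helper, not library code; it mirrors the default-label, assertion, and (N, 2) --> (N, 1, 2) expansion logic, and assumes the image-to-model scale factor `r` is 1.0:

```python
import numpy as np


def normalize_prompts(points, labels=None):
    """Coerce point prompts to (N, num_points, 2) and labels to (N, num_points)."""
    points = np.asarray(points, dtype=np.float32)
    points = points[None] if points.ndim == 1 else points  # (2,) -> (1, 2)
    if labels is None:
        labels = np.ones(points.shape[:-1])  # assume all points are foreground
    labels = np.asarray(labels, dtype=np.int32)
    assert points.shape[-2] == labels.shape[-1], (
        f"Number of points {points.shape[-2]} should match number of labels {labels.shape[-1]}."
    )
    if points.ndim == 2:
        # (N, 2) --> (N, 1, 2), (N,) --> (N, 1): one point per object
        points, labels = points[:, None, :], labels[:, None]
    return points, labels


# 1D prompt: a single point for a single object
print([a.shape for a in normalize_prompts([900, 370], [1])])  # (1, 1, 2), (1, 1)

# 2D prompt: two objects, one point each
print([a.shape for a in normalize_prompts([[400, 370], [900, 370]], [1, 1])])  # (2, 1, 2), (2, 1)

# 3D prompt: one object described by a positive and a negative point
print([a.shape for a in normalize_prompts([[[400, 370], [900, 370]]], [[1, 0]])])  # (1, 2, 2), (1, 2)
```

Note that flat labels of shape (N,) pair with 2D points of shape (N, 2): nested labels such as `[[1, 1]]` would pass the length assertion for two 2D points but come out as a mismatched (1, 1, 2) label tensor after the expansion, which is why the doc examples above keep `labels=[1, 1]` flat whenever `points` is 2D.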