Engines#

PanoSAM uses dependency injection for segmentation engines. Provide any object with a matching segment() method.

SegmentationEngine Protocol#

SegmentationEngine #

Bases: Protocol

Protocol for segmentation engines (structural typing).

Any class with a matching segment() method can be used. No inheritance required.

segment #

segment(image: Image, text_prompt: str, threshold: float = 0.5, mask_threshold: float = 0.5, simplify_tolerance: float = 0.005) -> list[FlatMaskResult]

Segment objects in an image using a text prompt.

Source code in src/panosam/api/models.py
def segment(
    self,
    image: Image.Image,
    text_prompt: str,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
    simplify_tolerance: float = 0.005,
) -> list[FlatMaskResult]:
    """Segment objects in an image using a text prompt.

    Protocol method: any engine with this signature can be injected,
    no inheritance required.

    Args:
        image: Input image as a PIL Image.
        text_prompt: Text describing the objects to segment.
        threshold: Confidence threshold for detections (0-1).
        mask_threshold: Threshold for binary mask generation (0-1).
        simplify_tolerance: Tolerance for polygon simplification (0-1).

    Returns:
        List of FlatMaskResult objects containing segmentation masks.
    """

SAM3Engine#

The built-in engine using Meta's SAM3 model. Requires the [sam3] extra.

pip install "panosam[sam3]"

SAM3Engine #

SAM3Engine(model_id: str = 'facebook/sam3', device: Optional[str] = None, dtype: dtype = None)

SAM3 segmentation engine using HuggingFace Transformers.

This engine uses the facebook/sam3 model for Promptable Concept Segmentation (PCS) on images. It supports text prompts to segment all instances of a concept.

Attributes:

Name Type Description
model

The SAM3 model.

processor

The SAM3 processor for pre/post-processing.

device

The device to run inference on (cuda, mps, or cpu).

Note

Requires SAM3 dependencies. Install with: pip install "panosam[sam3]" Also requires HuggingFace login: huggingface-cli login

Initialize the SAM3 engine.

Parameters:

Name Type Description Default
model_id str

HuggingFace model ID for SAM3.

'facebook/sam3'
device Optional[str]

Device to use. If None, auto-detects (cuda > mps > cpu).

None
dtype dtype

Data type for model weights. Defaults to torch.float32.

None

Raises:

Type Description
ImportError

If SAM3 dependencies are not installed.

Source code in src/panosam/engines/sam3.py
def __init__(
    self,
    model_id: str = "facebook/sam3",
    device: Optional[str] = None,
    dtype: Optional["torch.dtype"] = None,
):
    """Initialize the SAM3 engine.

    Args:
        model_id: HuggingFace model ID for SAM3.
        device: Device to use. If None, auto-detects (cuda > mps > cpu).
        dtype: Data type for model weights. Defaults to torch.float32.

    Raises:
        ImportError: If SAM3 dependencies are not installed.
    """
    # Check dependencies first with helpful error message
    _check_sam_dependencies()

    import torch
    from transformers import Sam3Processor, Sam3Model

    # Default dtype is resolved here (not in the signature) because torch
    # may not be importable at module load time.
    if dtype is None:
        dtype = torch.float32

    # Auto-detect device: prefer CUDA, then Apple MPS, then CPU.
    if device is None:
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    self.device = device
    self.dtype = dtype

    print(f"Loading SAM3 model on {device}...")
    # Bug fix: `dtype` was previously stored but never applied, so a caller
    # requesting e.g. float16 still got float32 weights. Cast explicitly.
    self.model = Sam3Model.from_pretrained(model_id).to(device=device, dtype=dtype)
    self.processor = Sam3Processor.from_pretrained(model_id)
    print("SAM3 model loaded successfully.")

segment #

segment(image: Image, text_prompt: str, threshold: float = 0.5, mask_threshold: float = 0.5, simplify_tolerance: float = 0.005, return_raw_masks: bool = False) -> List[FlatMaskResult] | Tuple[List[FlatMaskResult], List[np.ndarray]]

Segment objects in an image using a text prompt.

Parameters:

Name Type Description Default
image Image

Input image as PIL Image.

required
text_prompt str

Text describing the objects to segment (e.g., "car", "person").

required
threshold float

Confidence threshold for detections (0-1).

0.5
mask_threshold float

Threshold for binary mask generation (0-1).

0.5
simplify_tolerance float

Tolerance for polygon simplification (0-1).

0.005
return_raw_masks bool

If True, also return raw binary masks for visualization.

False

Returns:

Type Description
List[FlatMaskResult] | Tuple[List[FlatMaskResult], List[ndarray]]

List of FlatMaskResult objects containing segmentation masks.

List[FlatMaskResult] | Tuple[List[FlatMaskResult], List[ndarray]]

If return_raw_masks=True, returns tuple of (flat_results, raw_masks).

Source code in src/panosam/engines/sam3.py
def segment(
    self,
    image: Image.Image,
    text_prompt: str,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
    simplify_tolerance: float = 0.005,
    return_raw_masks: bool = False,
) -> List[FlatMaskResult] | Tuple[List[FlatMaskResult], List[np.ndarray]]:
    """Segment all instances matching a text prompt.

    Args:
        image: Input image as PIL Image.
        text_prompt: Text describing the objects to segment (e.g., "car", "person").
        threshold: Confidence threshold for detections (0-1).
        mask_threshold: Threshold for binary mask generation (0-1).
        simplify_tolerance: Tolerance for polygon simplification (0-1).
        return_raw_masks: If True, also return raw binary masks for visualization.

    Returns:
        List of FlatMaskResult objects containing segmentation masks.
        If return_raw_masks=True, returns tuple of (flat_results, raw_masks).
    """
    import torch

    # The model expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    width, height = image.size

    model_inputs = self.processor(
        images=image, text=text_prompt, return_tensors="pt"
    ).to(self.device)

    with torch.no_grad():
        model_outputs = self.model(**model_inputs)

    # Post-processing wants target sizes as (height, width).
    detections = self.processor.post_process_instance_segmentation(
        model_outputs,
        threshold=threshold,
        mask_threshold=mask_threshold,
        target_sizes=[[height, width]],
    )[0]

    flat_results: List[FlatMaskResult] = []
    raw_masks: List[np.ndarray] = []

    for idx, (mask, score) in enumerate(
        zip(detections.get("masks", []), detections.get("scores", []))
    ):
        # Normalize the mask to a 2D numpy array.
        arr = mask.cpu().numpy() if isinstance(mask, torch.Tensor) else mask
        if arr.ndim > 2:
            arr = arr.squeeze()

        result = FlatMaskResult.from_binary_mask(
            mask=arr,
            score=float(score),
            label=text_prompt,
            mask_id=f"{text_prompt}_{idx}",
            simplify_tolerance=simplify_tolerance,
        )

        # Keep only masks that produced at least one real polygon (3+ points).
        if result.polygons and any(len(poly) >= 3 for poly in result.polygons):
            flat_results.append(result)
            if return_raw_masks:
                raw_masks.append(arr)

    return (flat_results, raw_masks) if return_raw_masks else flat_results

segment_with_boxes #

segment_with_boxes(image: Image, boxes: List[Tuple[int, int, int, int]], box_labels: Optional[List[int]] = None, threshold: float = 0.5, mask_threshold: float = 0.5, simplify_tolerance: float = 0.005) -> List[FlatMaskResult]

Segment objects in an image using bounding box prompts.

Parameters:

Name Type Description Default
image Image

Input image as PIL Image.

required
boxes List[Tuple[int, int, int, int]]

List of bounding boxes in (x1, y1, x2, y2) pixel format.

required
box_labels Optional[List[int]]

List of labels (1 for positive, 0 for negative). Defaults to all positive.

None
threshold float

Confidence threshold for detections (0-1).

0.5
mask_threshold float

Threshold for binary mask generation (0-1).

0.5
simplify_tolerance float

Tolerance for polygon simplification (0-1).

0.005

Returns:

Type Description
List[FlatMaskResult]

List of FlatMaskResult objects containing segmentation masks.

Source code in src/panosam/engines/sam3.py
def segment_with_boxes(
    self,
    image: Image.Image,
    boxes: List[Tuple[int, int, int, int]],
    box_labels: Optional[List[int]] = None,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
    simplify_tolerance: float = 0.005,
) -> List[FlatMaskResult]:
    """Segment objects using bounding box prompts.

    Args:
        image: Input image as PIL Image.
        boxes: List of bounding boxes in (x1, y1, x2, y2) pixel format.
        box_labels: List of labels (1 for positive, 0 for negative). Defaults to all positive.
        threshold: Confidence threshold for detections (0-1).
        mask_threshold: Threshold for binary mask generation (0-1).
        simplify_tolerance: Tolerance for polygon simplification (0-1).

    Returns:
        List of FlatMaskResult objects containing segmentation masks.
    """
    import torch

    if not boxes:
        return []

    # The model expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    width, height = image.size

    # All boxes are positive prompts unless told otherwise.
    labels = box_labels if box_labels is not None else [1] * len(boxes)

    # Processor expects shape [batch, num_boxes, 4] plus matching labels.
    model_inputs = self.processor(
        images=image,
        input_boxes=[[list(box) for box in boxes]],
        input_boxes_labels=[labels],
        return_tensors="pt",
    ).to(self.device)

    with torch.no_grad():
        model_outputs = self.model(**model_inputs)

    # Post-processing wants target sizes as (height, width).
    detections = self.processor.post_process_instance_segmentation(
        model_outputs,
        threshold=threshold,
        mask_threshold=mask_threshold,
        target_sizes=[[height, width]],
    )[0]

    flat_results: List[FlatMaskResult] = []
    for idx, (mask, score) in enumerate(
        zip(detections.get("masks", []), detections.get("scores", []))
    ):
        # Normalize the mask to a 2D numpy array.
        arr = mask.cpu().numpy() if isinstance(mask, torch.Tensor) else mask
        if arr.ndim > 2:
            arr = arr.squeeze()

        result = FlatMaskResult.from_binary_mask(
            mask=arr,
            score=float(score),
            label=f"box_{idx}",
            mask_id=f"box_{idx}",
            simplify_tolerance=simplify_tolerance,
        )

        # Keep only masks that produced at least one real polygon (3+ points).
        if result.polygons and any(len(poly) >= 3 for poly in result.polygons):
            flat_results.append(result)

    return flat_results

get_raw_masks #

get_raw_masks(image: Image, text_prompt: str, threshold: float = 0.5, mask_threshold: float = 0.5) -> Tuple[List[np.ndarray], List[float]]

Get raw binary masks without polygon conversion.

Useful when you need the full mask data rather than simplified polygons.

Parameters:

Name Type Description Default
image Image

Input image as PIL Image.

required
text_prompt str

Text describing the objects to segment.

required
threshold float

Confidence threshold for detections (0-1).

0.5
mask_threshold float

Threshold for binary mask generation (0-1).

0.5

Returns:

Type Description
Tuple[List[ndarray], List[float]]

Tuple of (masks, scores) where masks are numpy arrays.

Source code in src/panosam/engines/sam3.py
def get_raw_masks(
    self,
    image: Image.Image,
    text_prompt: str,
    threshold: float = 0.5,
    mask_threshold: float = 0.5,
) -> Tuple[List[np.ndarray], List[float]]:
    """Run text-prompted segmentation and return raw binary masks.

    Useful when you need the full mask data rather than simplified polygons.

    Args:
        image: Input image as PIL Image.
        text_prompt: Text describing the objects to segment.
        threshold: Confidence threshold for detections (0-1).
        mask_threshold: Threshold for binary mask generation (0-1).

    Returns:
        Tuple of (masks, scores) where masks are numpy arrays.
    """
    import torch

    # The model expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    width, height = image.size

    model_inputs = self.processor(
        images=image, text=text_prompt, return_tensors="pt"
    ).to(self.device)

    with torch.no_grad():
        model_outputs = self.model(**model_inputs)

    # Post-processing wants target sizes as (height, width).
    detections = self.processor.post_process_instance_segmentation(
        model_outputs,
        threshold=threshold,
        mask_threshold=mask_threshold,
        target_sizes=[[height, width]],
    )[0]

    masks: List[np.ndarray] = []
    scores: List[float] = []
    for mask, score in zip(detections.get("masks", []), detections.get("scores", [])):
        # Normalize each mask to a 2D numpy array.
        arr = mask.cpu().numpy() if isinstance(mask, torch.Tensor) else mask
        if arr.ndim > 2:
            arr = arr.squeeze()
        masks.append(arr)
        scores.append(float(score))

    return masks, scores

Custom Engines#

Any class with a compatible segment() method works:

import panosam as ps
from PIL import Image

class MyEngine:
    """Minimal example engine satisfying the SegmentationEngine protocol."""

    def segment(
        self,
        image: Image.Image,
        text_prompt: str,
        threshold: float = 0.5,
        mask_threshold: float = 0.5,
        simplify_tolerance: float = 0.005,
    ) -> list[ps.FlatMaskResult]:
        # Return list of FlatMaskResult
        ...

client = ps.PanoSAM(engine=MyEngine())

Mask Results#

FlatMaskResult dataclass #

FlatMaskResult(polygons: List[List[Tuple[float, float]]], score: float, label: Optional[str] = None, mask_id: Optional[str] = None)

A segmentation mask result in flat/perspective image coordinates.

Attributes:

Name Type Description
polygons List[List[Tuple[float, float]]]

List of polygons, each polygon is a list of (x, y) tuples in normalized coordinates (0-1 range, where 0,0 is top-left).

score float

Confidence score for this mask (0-1).

label Optional[str]

Optional text label for the segmented object.

mask_id Optional[str]

Optional unique identifier for this mask.

to_sphere #

to_sphere(horizontal_fov: float, vertical_fov: float, yaw_offset: float, pitch_offset: float) -> SphereMaskResult

Convert flat mask result to spherical coordinates.

Uses proper 3D rotation to accurately map perspective image coordinates to equirectangular spherical coordinates.

Parameters:

Name Type Description Default
horizontal_fov float

Horizontal field of view in degrees.

required
vertical_fov float

Vertical field of view in degrees.

required
yaw_offset float

Horizontal offset of the perspective in degrees.

required
pitch_offset float

Vertical offset of the perspective in degrees.

required

Returns:

Type Description
SphereMaskResult

SphereMaskResult with polygons in spherical coordinates.

Source code in src/panosam/sam/models.py
def to_sphere(
    self,
    horizontal_fov: float,
    vertical_fov: float,
    yaw_offset: float,
    pitch_offset: float,
) -> "SphereMaskResult":
    """Convert flat mask result to spherical coordinates.

    Uses proper 3D rotation to accurately map perspective image coordinates
    to equirectangular spherical coordinates.

    Args:
        horizontal_fov: Horizontal field of view in degrees. Must be > 0.
        vertical_fov: Vertical field of view in degrees. Must be > 0.
        yaw_offset: Horizontal offset of the perspective in degrees.
        pitch_offset: Vertical offset of the perspective in degrees.

    Returns:
        SphereMaskResult with polygons in spherical coordinates.

    Raises:
        ValueError: If any parameter is None, or a FOV is not positive.
    """
    if (
        horizontal_fov is None
        or vertical_fov is None
        or yaw_offset is None
        or pitch_offset is None
    ):
        raise ValueError("Missing parameters")
    # Bug fix: the check previously used `< 0`, silently accepting a zero
    # FOV even though the message promises "positive". A zero FOV is
    # degenerate for the perspective projection, so reject it as well.
    if horizontal_fov <= 0 or vertical_fov <= 0:
        raise ValueError("FOV must be positive")

    # Convert each polygon vertex from perspective (u, v) to (yaw, pitch).
    sphere_polygons = []
    for polygon in self.polygons:
        sphere_polygon = []
        for u, v in polygon:
            yaw, pitch = perspective_to_sphere(
                u, v, horizontal_fov, vertical_fov, yaw_offset, pitch_offset
            )
            sphere_polygon.append((yaw, pitch))
        if sphere_polygon:
            sphere_polygons.append(sphere_polygon)

    # Proper spherical averaging handles wrap-around at ±180° correctly.
    if sphere_polygons:
        center_yaw, center_pitch = calculate_spherical_centroid(sphere_polygons)
    else:
        # No vertices at all: fall back to the view center.
        center_yaw = yaw_offset
        center_pitch = pitch_offset

    return SphereMaskResult(
        polygons=sphere_polygons,
        score=self.score,
        label=self.label,
        mask_id=self.mask_id,
        center_yaw=center_yaw,
        center_pitch=center_pitch,
    )

to_dict #

to_dict() -> Dict[str, Any]

Convert to dictionary representation.

Source code in src/panosam/sam/models.py
def to_dict(self) -> Dict[str, Any]:
    """Serialize this flat mask result to a plain dictionary."""
    fields = ("polygons", "score", "label", "mask_id")
    return {name: getattr(self, name) for name in fields}

from_binary_mask classmethod #

from_binary_mask(mask: ndarray, score: float, label: Optional[str] = None, mask_id: Optional[str] = None, simplify_tolerance: float = 0.001, min_contour_area_ratio: float = 0.01) -> FlatMaskResult

Create a FlatMaskResult from a binary mask.

Extracts ALL significant contours from the mask, not just the largest.

Parameters:

Name Type Description Default
mask ndarray

Binary mask as numpy array (H, W) with values 0 or 1/255.

required
score float

Confidence score for this mask.

required
label Optional[str]

Optional text label.

None
mask_id Optional[str]

Optional unique identifier.

None
simplify_tolerance float

Tolerance for polygon simplification (0-1).

0.001
min_contour_area_ratio float

Minimum contour area as ratio of largest contour. Contours smaller than this are discarded.

0.01

Returns:

Type Description
FlatMaskResult

FlatMaskResult with normalized polygon coordinates.

Source code in src/panosam/sam/models.py
@classmethod
def from_binary_mask(
    cls,
    mask: np.ndarray,
    score: float,
    label: Optional[str] = None,
    mask_id: Optional[str] = None,
    simplify_tolerance: float = 0.001,
    min_contour_area_ratio: float = 0.01,
) -> "FlatMaskResult":
    """Build a FlatMaskResult from a binary mask.

    Extracts every significant contour from the mask, not only the largest.

    Args:
        mask: Binary mask as numpy array (H, W) with values 0 or 1/255.
        score: Confidence score for this mask.
        label: Optional text label.
        mask_id: Optional unique identifier.
        simplify_tolerance: Tolerance for polygon simplification (0-1).
        min_contour_area_ratio: Minimum contour area as ratio of largest contour.
                                Contours smaller than this are discarded.

    Returns:
        FlatMaskResult with normalized polygon coordinates.
    """
    import cv2

    # Binarize to uint8 {0, 255} if the mask is not already uint8.
    if mask.dtype != np.uint8:
        mask = (mask > 0.5).astype(np.uint8) * 255

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return cls(polygons=[], score=score, label=label, mask_id=mask_id)

    # Discard contours that are tiny relative to the largest one.
    areas = [cv2.contourArea(c) for c in contours]
    area_floor = max(areas) * min_contour_area_ratio

    height, width = mask.shape[:2]
    polygons = []

    for contour, area in zip(contours, areas):
        if area < area_floor:
            continue

        # Simplify; epsilon scales with the contour's perimeter.
        eps = simplify_tolerance * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, eps, True)

        # A polygon needs at least 3 vertices.
        if len(approx) < 3:
            continue

        # Normalize pixel coordinates into the 0-1 range.
        polygons.append(
            [(float(pt[0][0]) / width, float(pt[0][1]) / height) for pt in approx]
        )

    return cls(polygons=polygons, score=score, label=label, mask_id=mask_id)

SphereMaskResult dataclass #

SphereMaskResult(polygons: List[List[Tuple[float, float]]], score: float, label: Optional[str] = None, mask_id: Optional[str] = None, center_yaw: float = 0.0, center_pitch: float = 0.0)

A segmentation mask result in spherical/panoramic coordinates.

Attributes:

Name Type Description
polygons List[List[Tuple[float, float]]]

List of polygons, each polygon is a list of (yaw, pitch) tuples in degrees.

score float

Confidence score for this mask (0-1).

label Optional[str]

Optional text label for the segmented object.

mask_id Optional[str]

Optional unique identifier for this mask.

center_yaw float

Yaw of the polygon centroid in degrees.

center_pitch float

Pitch of the polygon centroid in degrees.

to_dict #

to_dict() -> Dict[str, Any]

Convert to dictionary representation.

Source code in src/panosam/sam/models.py
def to_dict(self) -> Dict[str, Any]:
    """Serialize this spherical result, including its centroid."""
    fields = (
        "polygons",
        "score",
        "label",
        "mask_id",
        "center_yaw",
        "center_pitch",
    )
    return {name: getattr(self, name) for name in fields}

from_dict classmethod #

from_dict(data: Dict[str, Any]) -> SphereMaskResult

Create from dictionary representation.

Source code in src/panosam/sam/models.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SphereMaskResult":
    """Reconstruct a SphereMaskResult from its dictionary form.

    Accepts both the current ``polygons`` key and the legacy single
    ``polygon`` key.
    """
    if "polygons" in data:
        polygons = [[tuple(pt) for pt in ring] for ring in data["polygons"]]
    elif "polygon" in data:
        # Legacy payloads carried a single polygon (possibly empty).
        legacy = data["polygon"]
        polygons = [[tuple(pt) for pt in legacy]] if legacy else []
    else:
        polygons = []

    meta = dict(
        score=data["score"],
        label=data.get("label"),
        mask_id=data.get("mask_id"),
        center_yaw=data.get("center_yaw", 0.0),
        center_pitch=data.get("center_pitch", 0.0),
    )
    return cls(polygons=polygons, **meta)

get_bounding_box #

get_bounding_box() -> Tuple[float, float, float, float]

Get the bounding box of all polygons.

Returns:

Type Description
Tuple[float, float, float, float]

Tuple of (min_yaw, min_pitch, max_yaw, max_pitch) in degrees.

Source code in src/panosam/sam/models.py
def get_bounding_box(self) -> Tuple[float, float, float, float]:
    """Axis-aligned bounds over every vertex of every polygon.

    Returns:
        Tuple of (min_yaw, min_pitch, max_yaw, max_pitch) in degrees.
    """
    vertices = [vertex for ring in self.polygons for vertex in ring]
    if not vertices:
        # No polygons at all: degenerate zero box.
        return (0, 0, 0, 0)

    yaws, pitches = zip(*vertices)
    return (min(yaws), min(pitches), max(yaws), max(pitches))

get_area_estimate #

get_area_estimate() -> float

Estimate the total area of all polygons using the shoelace formula.

Returns:

Type Description
float

Estimated area in square degrees.

Source code in src/panosam/sam/models.py
def get_area_estimate(self) -> float:
    """Estimate the total area of all polygons via the shoelace formula.

    Returns:
        Estimated area in square degrees.
    """
    total = 0.0
    for ring in self.polygons:
        count = len(ring)
        # Fewer than 3 vertices cannot enclose any area.
        if count < 3:
            continue

        # Shoelace: sum of cross products of consecutive vertex pairs,
        # wrapping from the last vertex back to the first.
        cross = sum(
            ring[i][0] * ring[(i + 1) % count][1]
            - ring[(i + 1) % count][0] * ring[i][1]
            for i in range(count)
        )
        total += abs(cross) / 2.0

    return total
    return total_area