"""Postprocessor classes to transform MDETR output according to the downstream task."""

import dataclasses
import logging
from collections import defaultdict
from typing import Dict, List, Optional

import torch
from sam3.model import box_ops
from sam3.model.data_misc import BatchedInferenceMetadata, interpolate
from sam3.train.masks_ops import rle_encode, robust_rle_encode
from torch import nn


class PostProcessNullOp(nn.Module):
    """No-op postprocessor that returns the find stages unchanged."""

    def __init__(self, **kwargs):
        super().__init__()

    def forward(self, input):
        pass

    def process_results(self, **kwargs):
        return kwargs["find_stages"]


class PostProcessImage(nn.Module):
    """This module converts the model's output into the format expected by the COCO API."""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        to_cpu: bool = True,
        use_original_ids: bool = False,
        use_original_sizes_box: bool = False,
        use_original_sizes_mask: bool = False,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        use_presence: bool = True,
        detection_threshold: float = -1.0,
    ) -> None:
        super().__init__()
        self.max_dets_per_img = max_dets_per_img
        self.iou_type = iou_type
        self.to_cpu = to_cpu
        self.convert_mask_to_rle = convert_mask_to_rle
        self.always_interpolate_masks_on_gpu = always_interpolate_masks_on_gpu

        self.use_presence = use_presence
        self.detection_threshold = detection_threshold
        self.use_original_ids = use_original_ids
        self.use_original_sizes_box = use_original_sizes_box
        self.use_original_sizes_mask = use_original_sizes_mask

    @torch.no_grad()
    def forward(
        self,
        outputs,
        target_sizes_boxes,
        target_sizes_masks,
        forced_labels=None,
        consistent=False,
        ret_tensordict: bool = False,
    ):
        """Perform the computation.

        Parameters:
            outputs: raw outputs of the model
            target_sizes_boxes: tensor of dimension [batch_size x 2] containing the size of each image of the batch.
                For evaluation, this must be the original image size (before any data augmentation).
                For visualization, this should be the image size after data augmentation, but before padding.
            target_sizes_masks: same, but used to resize masks.
            forced_labels: tensor of dimension [batch_size] containing the label to force for each image of the batch.
                This is useful when evaluating the model using standard metrics (e.g. on COCO, LVIS). In that case,
                we query the model with every possible class label, so when we pass the predictions to the evaluator,
                we want to make sure that the predicted "class" matches the one that was queried.
            consistent: whether all target sizes are equal.
            ret_tensordict: Experimental argument. If true, return a tensordict.TensorDict instead of a list of
                dictionaries for easier manipulation.
        """
        if ret_tensordict:
            assert (
                consistent is True
            ), "We don't support returning TensorDict if the outputs have different shapes"
            assert self.detection_threshold <= 0.0, "TODO: implement?"
            try:
                from tensordict import TensorDict
            except ImportError:
                logging.info(
                    "tensordict is not installed. Install it by running `pip install tensordict --no-deps`. Falling back to `ret_tensordict=False`."
                )
                ret_tensordict = False

        out_bbox = outputs["pred_boxes"] if "pred_boxes" in outputs else None
        out_logits = outputs["pred_logits"]
        pred_masks = outputs["pred_masks"] if self.iou_type == "segm" else None
        out_probs = out_logits.sigmoid()
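        # If the model predicts an image-level presence logit, use it to gate
        # the per-query probabilities: detections in images where the concept
        # is predicted absent are downweighted multiplicatively.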
        if self.use_presence:
            presence_score = outputs["presence_logit_dec"].sigmoid().unsqueeze(1)
            out_probs = out_probs * presence_score

        assert target_sizes_boxes.shape[1] == 2
        assert target_sizes_masks.shape[1] == 2
        batch_size = target_sizes_boxes.shape[0]

        boxes, scores, labels, keep = self._process_boxes_and_labels(
            target_sizes_boxes, forced_labels, out_bbox, out_probs
        )
        assert boxes is None or len(boxes) == batch_size
        out_masks = self._process_masks(
            target_sizes_masks, pred_masks, consistent=consistent, keep=keep
        )
        del pred_masks

        if boxes is None:
            assert out_masks is not None
            assert (
                not ret_tensordict
            ), "We don't support returning TensorDict if the output does not contain boxes"
            B = len(out_masks)
            boxes = [None] * B
            scores = [None] * B
            labels = [None] * B

        results = {
            "scores": scores,
            "labels": labels,
            "boxes": boxes,
        }
        if out_masks is not None:
            if self.convert_mask_to_rle:
                results.update(masks_rle=out_masks)
            else:
                results.update(masks=out_masks)

        if ret_tensordict:
            results = TensorDict(results).auto_batch_size_()
            if self.to_cpu:
                results = results.cpu()
        else:
            results = [
                dict(zip(results.keys(), res_tuple))
                for res_tuple in zip(*results.values())
            ]

        return results

    def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
        if pred_masks is None:
            return None
        if self.always_interpolate_masks_on_gpu:
            gpu_device = target_sizes.device
            assert gpu_device.type == "cuda"
            pred_masks = pred_masks.to(device=gpu_device)
        if consistent:
            assert keep is None, "TODO: implement?"
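            # All images share the same target size, so the whole batch can be
            # upsampled with a single interpolation call.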
            target_size = target_sizes.unique(dim=0)
            assert target_size.size(0) == 1, "Expecting all target sizes to be equal"
            out_masks = (
                interpolate(
                    pred_masks,
                    target_size.squeeze().tolist(),
                    mode="bilinear",
                    align_corners=False,
                ).sigmoid()
                > 0.5
            )
            if self.convert_mask_to_rle:
                raise RuntimeError("TODO: implement?")
            if self.to_cpu:
                out_masks = out_masks.cpu()
        else:
            out_masks = [[]] * len(pred_masks)
            assert keep is None or len(keep) == len(pred_masks)
            for i, mask in enumerate(pred_masks):
                h, w = target_sizes[i]
                if keep is not None:
                    mask = mask[keep[i]]
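
                # Interpolation can fail on the GPU (e.g. out of memory for
                # very large target sizes); fall back to interpolating on the
                # CPU and move the result back to the original device.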
                try:
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                except Exception as e:
                    logging.info(f"Issue found ({e}), reverting to CPU mode!")
                    mask_device = mask.device
                    mask = mask.cpu()
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                    interpolated = interpolated.to(mask_device)

                if self.convert_mask_to_rle:
                    out_masks[i] = robust_rle_encode(interpolated.squeeze(1))
                else:
                    out_masks[i] = interpolated
                    if self.to_cpu:
                        out_masks[i] = out_masks[i].cpu()

        return out_masks

    def _process_boxes_and_labels(
        self, target_sizes, forced_labels, out_bbox, out_probs
    ):
        if out_bbox is None:
            return None, None, None, None
        assert len(out_probs) == len(target_sizes)
        if self.to_cpu:
            out_probs = out_probs.cpu()
        scores, labels = out_probs.max(-1)
        if forced_labels is None:
            labels = torch.ones_like(labels)
        else:
            labels = forced_labels[:, None].expand_as(labels)
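
        # Convert boxes from normalized [cx, cy, w, h] to absolute
        # [x0, y0, x1, y1] coordinates in the target image size.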
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)

        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        if self.to_cpu:
            boxes = boxes.cpu()

        keep = None
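        # Thresholding yields a different number of detections per image, so
        # the batched tensors become ragged per-image lists below.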
        if self.detection_threshold > 0:
            keep = scores > self.detection_threshold
            assert len(keep) == len(boxes) == len(scores) == len(labels)
            boxes = [b[k.to(b.device)] for b, k in zip(boxes, keep)]
            scores = [s[k.to(s.device)] for s, k in zip(scores, keep)]
            labels = [lbl[k.to(lbl.device)] for lbl, k in zip(labels, keep)]

        return boxes, scores, labels, keep

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        if find_stages.loss_stages is not None:
            find_metadatas = [find_metadatas[i] for i in find_stages.loss_stages]
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            img_size_for_boxes = (
                meta.original_size
                if self.use_original_sizes_box
                else torch.ones_like(meta.original_size)
            )
            img_size_for_masks = (
                meta.original_size
                if self.use_original_sizes_mask
                else torch.ones_like(meta.original_size)
            )
            detection_results = self(
                outputs,
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta.original_category_id if self.use_original_ids else None
                ),
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
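            # The same image can appear in several find stages (e.g. one stage
            # per queried category); merge their predictions key by key.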
            for img_id, result in zip(ids, detection_results):
                if img_id.item() not in results:
                    results[img_id.item()] = result
                else:
                    assert set(results[img_id.item()].keys()) == set(result.keys())
                    for k in result.keys():
                        if isinstance(result[k], torch.Tensor):
                            results[img_id.item()][k] = torch.cat(
                                [results[img_id.item()][k], result[k]], dim=0
                            )
                        elif isinstance(result[k], list):
                            results[img_id.item()][k] += result[k]
                        else:
                            raise NotImplementedError(
                                f"Unexpected type {type(result[k])} in result."
                            )
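
        # Keep at most `max_dets_per_img` detections per image, ranked by score.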
        for img_id, result in results.items():
            if (
                self.max_dets_per_img > 0
                and len(result["scores"]) > self.max_dets_per_img
            ):
                _, topk_indexes = torch.topk(
                    result["scores"], self.max_dets_per_img, dim=0
                )
                if self.to_cpu:
                    topk_indexes = topk_indexes.cpu()
                for k in result.keys():
                    if isinstance(results[img_id][k], list):
                        results[img_id][k] = [
                            results[img_id][k][i] for i in topk_indexes.tolist()
                        ]
                    else:
                        results[img_id][k] = results[img_id][k].to(topk_indexes.device)[
                            topk_indexes
                        ]

        return results


class PostProcessAPIVideo(PostProcessImage):
    """This module converts the video model's output into the format expected by the YT-VIS API."""

    def __init__(
        self,
        *args,
        to_cpu: bool = True,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        prob_thresh: float = 0.5,
        use_presence: bool = False,
        **kwargs,
    ):
        # RLE conversion and CPU transfer are handled at the video level (see
        # process_results), so the per-frame parent class is configured not to
        # perform them.
        super().__init__(
            *args,
            convert_mask_to_rle=False,
            always_interpolate_masks_on_gpu=always_interpolate_masks_on_gpu,
            use_presence=use_presence,
            **kwargs,
        )

        self.EXPECTED_KEYS = [
            "pred_logits",
            "pred_boxes",
            "pred_masks",
        ]

        self.convert_mask_to_rle_for_video = convert_mask_to_rle
        self.to_cpu_for_video = to_cpu
        self.prob_thresh = prob_thresh

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """
        Tracking postprocessor for the SAM 3 video model.

        This function takes the output of the SAM 3 video model and processes it to extract all the tracklet predictions.

        Args:
            find_stages: Per-frame outputs of the SAM 3 video model.
            find_metadatas: A list of BatchedInferenceMetadata objects containing metadata about each frame.
            **kwargs: Additional keyword arguments.

        Returns:
            A dictionary of predictions with video_id as key.
        """
        try:
            from tensordict import TensorDict
        except ImportError as e:
            logging.error(
                "tensordict is not installed, please install it by running `pip install tensordict --no-deps`"
            )
            raise e

        assert len(find_stages) > 0, "There is nothing to postprocess?"
        PROMPT_AXIS, OBJ_QUERY_AXIS = (0, 1)
        NO_OBJ_ID = -1
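
        # For each tracked (prompt_id, object_id) pair, record both its row in
        # the packed predictions and the frame it appeared on, so packed rows
        # can later be scattered into a dense per-frame layout.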
        tracked_objects_packed_idx = defaultdict(list)
        tracked_objects_frame_idx = defaultdict(list)
        total_num_preds = 0

        vid_preds_packed: List[TensorDict] = []
        vid_masklets_rle_packed: List[Optional[Dict]] = []
        video_id = -1

        for frame_idx, (frame_outs, meta) in enumerate(
            zip(find_stages, find_metadatas)
        ):
            frame_outs_td = TensorDict(
                {k: frame_outs[k] for k in self.EXPECTED_KEYS}
            ).auto_batch_size_()
            meta_td = TensorDict(dataclasses.asdict(meta)).auto_batch_size_()
            unique_vid_id = meta.original_image_id.unique()
            assert unique_vid_id.size(0) == 1
            if video_id == -1:
                video_id = unique_vid_id.item()
            else:
                assert (
                    video_id == unique_vid_id.item()
                ), "We can only postprocess one video per datapoint"

            obj_ids_per_frame = frame_outs["pred_object_ids"]
            assert obj_ids_per_frame.size(-1) == frame_outs["pred_logits"].size(-2)
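            # Queries whose predicted probability is below the threshold are
            # marked NO_OBJ_ID and excluded from tracklets on this frame.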
            if self.prob_thresh is not None:
                pred_probs = frame_outs["pred_logits"].sigmoid().squeeze(-1)
                obj_ids_per_frame = torch.where(
                    pred_probs >= self.prob_thresh, obj_ids_per_frame, NO_OBJ_ID
                )
            tracked_obj_ids_idx = torch.where(obj_ids_per_frame != NO_OBJ_ID)

            tracked_obj_ids = [
                (p_id.item(), obj_ids_per_frame[p_id, q_id].item())
                for p_id, q_id in zip(
                    tracked_obj_ids_idx[PROMPT_AXIS],
                    tracked_obj_ids_idx[OBJ_QUERY_AXIS],
                )
            ]
            if len(tracked_obj_ids) == 0:
                continue

            for oid in tracked_obj_ids:
                tracked_objects_packed_idx[oid].append(total_num_preds)
                tracked_objects_frame_idx[oid].append(frame_idx)
                total_num_preds += 1
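
            # Gather only the tracked (prompt, query) entries from this frame's
            # outputs; the result is packed along a single batch dimension.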
            tracked_objs_outs_td = frame_outs_td[tracked_obj_ids_idx]
            meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
            if self.always_interpolate_masks_on_gpu:
                gpu_device = meta_td["original_size"].device
                assert gpu_device.type == "cuda"
                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
            # NOTE: forward() takes separate target sizes for boxes and masks;
            # mirror PostProcessImage.process_results and derive each from the
            # corresponding parent flag.
            img_size_for_boxes = (
                meta_td["original_size"]
                if self.use_original_sizes_box
                else torch.ones_like(meta_td["original_size"])
            )
            img_size_for_masks = (
                meta_td["original_size"]
                if self.use_original_sizes_mask
                else torch.ones_like(meta_td["original_size"])
            )
            frame_results_td = self(
                tracked_objs_outs_td.unsqueeze(1),
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta_td["original_category_id"] if self.use_original_ids else None
                ),
                consistent=True,
                ret_tensordict=True,
            ).squeeze(1)
            del tracked_objs_outs_td

            if self.convert_mask_to_rle_for_video:
                interpolated_binary_masks = frame_results_td.pop("masks")
                rle_list = rle_encode(interpolated_binary_masks, return_areas=True)
                vid_masklets_rle_packed.extend(rle_list)

            if self.to_cpu_for_video:
                frame_results_td = frame_results_td.cpu()
            vid_preds_packed.append(frame_results_td)

        if len(vid_preds_packed) == 0:
            logging.debug(f"Video {video_id} has no predictions")
            return {video_id: []}

        vid_preds_packed = torch.cat(vid_preds_packed, dim=0)

        num_preds = len(tracked_objects_packed_idx)
        num_frames = len(find_stages)
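
        # Scatter the packed predictions into dense [num_preds, num_frames]
        # tensors, zero-filled on frames where a tracklet is absent; absent
        # scores are pre-filled with a large negative value so they cannot be
        # mistaken for real detections.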
        padded_frames_results = TensorDict(
            {
                k: torch.zeros(
                    num_preds, num_frames, *v.shape[1:], device=v.device, dtype=v.dtype
                )
                for k, v in vid_preds_packed.items()
            },
            batch_size=[
                num_preds,
                num_frames,
            ],
        )
        padded_frames_results["scores"][...] = -1e8

        tracklet_scores = []
        tracklet_labels = []
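
        # Tracklet-level aggregates: the score is the mean of the per-frame
        # scores on frames where the tracklet is present, and the label is the
        # mode of its per-frame labels.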
        if self.convert_mask_to_rle_for_video:
            vid_masklets_rle_padded = [[None] * num_frames for _ in range(num_preds)]
        for o_idx, oid in enumerate(tracked_objects_packed_idx):
            oid2packed_idx = tracked_objects_packed_idx[oid]
            oid2padded_idx = tracked_objects_frame_idx[oid]
            obj_packed_results = vid_preds_packed[oid2packed_idx]
            padded_frames_results[o_idx][oid2padded_idx] = obj_packed_results
            if self.convert_mask_to_rle_for_video:
                for packed_idx, padded_idx in zip(oid2packed_idx, oid2padded_idx):
                    vid_masklets_rle_padded[o_idx][padded_idx] = (
                        vid_masklets_rle_packed[packed_idx]
                    )

            tracklet_scores.append(obj_packed_results["scores"].mean())
            tracklet_labels.append(obj_packed_results["labels"].mode()[0])

        results = padded_frames_results.to_dict()
        results["scores"] = torch.stack(tracklet_scores, dim=0)
        results["labels"] = torch.stack(tracklet_labels, dim=0)
        if self.convert_mask_to_rle_for_video:
            results["masks_rle"] = vid_masklets_rle_padded

        results["per_frame_scores"] = padded_frames_results["scores"]

        return {video_id: results}


class PostProcessTracking(PostProcessImage):
    """Postprocessor for tracking: converts the model's output into per-frame
    detection results keyed by (media_id, object_id, frame_index)."""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        force_single_mask: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(max_dets_per_img=max_dets_per_img, iou_type=iou_type, **kwargs)
        self.force_single_mask = force_single_mask

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
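            # Optionally keep only the single highest-scoring mask per image,
            # e.g. for single-object tracking evaluation.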
            if self.force_single_mask:
                scores, labels = outputs["pred_logits"].max(-1)
                m = []
                for i in range(len(outputs["pred_masks"])):
                    score, idx = scores[i].max(0)
                    m.append(outputs["pred_masks"][i][idx])
                outputs["pred_masks"] = torch.stack(m, 0).unsqueeze(1)
            # NOTE: forward() takes separate target sizes for boxes and masks;
            # the original size is used for both here.
            detection_results = self(
                outputs, meta.original_size, meta.original_size, consistent=False
            )
            assert len(detection_results) == len(meta.coco_image_id)
            results.update(
                {
                    (media_id.item(), object_id.item(), frame_index.item()): result
                    for media_id, object_id, frame_index, result in zip(
                        meta.original_image_id,
                        meta.object_id,
                        meta.frame_index,
                        detection_results,
                    )
                }
            )
        return results


class PostProcessCounting(nn.Module):
    """This module converts the model's output to be evaluated for counting tasks"""

    def __init__(
        self,
        use_original_ids: bool = False,
        threshold: float = 0.5,
        use_presence: bool = False,
    ) -> None:
        """
        Args:
            use_original_ids: whether to use the original image ids or the coco ids
            threshold: threshold for counting (values above this are counted)
            use_presence: whether to gate scores with the image-level presence score
        """
        super().__init__()
        self.use_original_ids = use_original_ids
        self.threshold = threshold
        self.use_presence = use_presence

    def forward(self, outputs, target_sizes):
        """Perform the computation.

        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
        """
        scores = torch.sigmoid(outputs["pred_logits"]).squeeze(-1)
        if self.use_presence:
            presence_score = outputs["presence_logit_dec"].sigmoid()
            if presence_score.ndim == 1:
                presence_score = presence_score.unsqueeze(1)
            scores = scores * presence_score
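
        # The predicted count is the number of queries whose (presence-gated)
        # score exceeds the threshold.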
        counts = (scores > self.threshold).float().sum(dim=1)

        assert len(counts) == len(target_sizes)
        results = []
        for count in counts:
            results.append({"count": count.item()})

        return results

    @torch.no_grad()
    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            detection_results = self(
                outputs,
                meta.original_size,
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
            for img_id, result in zip(ids, detection_results):
                results[img_id.item()] = result

        return results