|
|
|
|
|
import json |
|
|
import os |
|
|
from collections import defaultdict |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS annotations to COCO-style video instance segmentation format.

    Args:
        ann_json (str): Path to the official YouTube VIS annotation JSON file.
        save_path (str | None): Path to save the converted COCO-style JSON.
            If None, the converted dict is returned without writing to disk.

    Returns:
        dict: The converted COCO-style annotation dictionary with keys
            "info", "images", "videos", "tracks", "annotations",
            "categories", and "licenses".
    """
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }

    # Context manager so the input file handle is closed promptly
    # (the original `json.load(open(...))` leaked the handle).
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]

    # Monotonically increasing 1-based COCO ids for images and annotations.
    records = dict(img_id=1, ann_id=1)

    # Group track-level annotations by their parent video for O(1) lookup.
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Each YouTube VIS annotation is one track: keep its id, class, and video.
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]

    for video_info in tqdm(official_anns["videos"]):
        # The video "name" is the directory shared by all of its frame files.
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)

        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)

            # .get avoids creating an empty defaultdict entry for videos
            # that have no annotations.
            for ann in vid_to_anns.get(video_info["id"], []):
                bbox = ann["bboxes"][frame_idx]
                # A None bbox means the track is absent from this frame.
                if bbox is None:
                    continue

                annotation = {
                    "id": records["ann_id"],
                    "video_id": video_info["id"],
                    "image_id": records["img_id"],
                    "track_id": ann["id"],
                    "category_id": ann["category_id"],
                    "bbox": bbox,
                    "area": ann["areas"][frame_idx],
                    "segmentation": ann["segmentations"][frame_idx],
                    "iscrowd": ann["iscrowd"],
                }
                VIS["annotations"].append(annotation)
                records["ann_id"] += 1

            records["img_id"] += 1

    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")

    if save_path is None:
        return VIS

    # Guard the empty-dirname case: os.makedirs("") raises FileNotFoundError
    # when save_path is a bare filename with no directory component.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # Context manager so the output file is flushed and closed deterministically.
    with open(save_path, "w") as f:
        json.dump(VIS, f)

    return VIS
|
|
|
|
|
|
|
|
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """Convert YouTubeVIS predictions to COCO format, preserving video_id.

    Each track-level prediction is expanded into one COCO annotation per
    frame in which it appears; a fresh track_id is assigned per prediction.

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON.
        converted_dataset_path: Path to converted COCO dataset JSON.
        output_path: Path to save COCO format predictions.

    Raises:
        RuntimeError: If a prediction references a (video_id, frame_index)
            pair absent from the converted dataset.
    """
    with open(youtubevis_pred_path) as f:
        ytv_predictions = json.load(f)

    with open(converted_dataset_path) as f:
        coco_dataset = json.load(f)

    # Map (video_id, frame_index) -> image id in the converted dataset.
    image_id_map = {}
    for img in coco_dataset["images"]:
        image_id_map[(img["video_id"], img["frame_index"])] = img["id"]

    coco_annotations = []

    # enumerate(..., start=1) assigns one fresh track id per prediction.
    for track_id, pred in enumerate(tqdm(ytv_predictions), start=1):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        segmentations = pred.get("segmentations", [])
        areas = pred.get("areas", [])
        score = pred["score"]

        # Pad optional per-frame fields so zip covers every bbox entry.
        if not segmentations:
            segmentations = [None] * len(bboxes)
        if not areas:
            areas = [None] * len(bboxes)

        per_frame = zip(bboxes, segmentations, areas)
        for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(per_frame):
            # Missing or degenerate (all-zero) boxes mean the track is
            # absent from this frame.
            if bbox is None or all(v == 0 for v in bbox):
                continue

            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )

            x, y, w, h = bbox

            # Prefer the predicted area when it is a positive value;
            # otherwise fall back to the box area.
            use_pred_area = area_from_pred is not None and area_from_pred > 0
            area = area_from_pred if use_pred_area else w * h

            coco_annotation = {
                "image_id": int(image_id),
                "video_id": video_id,
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }

            # Segmentation is optional; omit the key entirely when absent.
            if segmentation is not None:
                coco_annotation["segmentation"] = segmentation

            coco_annotations.append(coco_annotation)

    with open(output_path, "w") as f:
        json.dump(coco_annotations, f)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")
|
|
|