# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import copy
import gc
import logging
import os
from collections import defaultdict
from operator import xor
from pathlib import Path
from typing import List, Optional

import numpy as np
import pycocotools.mask as mask_util
import torch
from pycocotools.cocoeval import COCOeval
from sam3.eval.cgf1_eval import CGF1Eval
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.model.box_ops import box_xywh_inter_union
from sam3.train.masks_ops import rle_encode
from sam3.train.utils import distributed as dist
from typing_extensions import override

try:
    import rapidjson as json
except ModuleNotFoundError:
    import json

from iopath.common.file_io import g_pathmgr


class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.
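
    A rough usage sketch (illustrative only; assumes `video_gt_api` and `video_dt_api`
    are COCO-like API objects whose annotations carry per-frame `bboxes` /
    `segmentations` lists in YT-VIS format, and follows the standard COCOeval flow):

        ytvis_eval = YTVISeval(video_gt_api, video_dt_api, iouType="segm")
        ytvis_eval.evaluate()
        ytvis_eval.accumulate()
        ytvis_eval.summarize()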
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []

        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherits YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]

        if p.iouType == "segm":
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        def iou_tracklets(preds, gts):
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTS x Num frames
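            # Spatio-temporal IoU: accumulate intersection and union areas over all
            # frames of the tracklet before taking their ratio.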
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (
                union > 0
            ).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            inter = 0
            union = 0
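            # Accumulate per-frame intersection/union areas over the whole masklet.
            # A frame where only one of the two masklets has a mask contributes that
            # mask's area to the union only.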
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                assert np.isclose(inter, 0) and np.isclose(
                    union, 0
                ), "Encountered an error in IoU computation"
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)


class YTVISeval(YTVISevalMixin, COCOeval):
    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True


class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False


class YTVISResultsWriter:
    """
    Gathers and dumps predictions in YT-VIS format.
    Expected flow of API calls: reset() -> N * update() -> compute_synced()
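
    A minimal usage sketch (`my_postprocessor`, `evaluators`, and the `update()` arguments
    are placeholders; `update()` simply forwards its arguments to
    `postprocessor.process_results`):

        writer = YTVISResultsWriter(
            dump_file="/path/to/ytvis_preds.json",
            postprocessor=my_postprocessor,
            pred_file_evaluators=evaluators,
        )
        writer.reset()
        for batch_outputs in eval_batches:
            writer.update(batch_outputs)
        metrics = writer.compute_synced()  # gathers, dumps, and runs the evaluators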
    """

    def __init__(
        self,
        dump_file: str,
        postprocessor,
        gather_pred_via_filesys=False,
        pred_file_evaluators: Optional[List] = None,
        save_per_frame_scores: bool = False,
        write_eval_metrics_file: bool = True,
        eval_metrics_file_suffix: str = ".sam3_eval_metrics",
    ):
        self.dump_file = dump_file
        self.dump = []
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        if dist.is_main_process():
            dirname = os.path.dirname(self.dump_file)
            if not os.path.exists(dirname):
                os.makedirs(dirname, exist_ok=True)
                logging.info(f"Creating folder: {dirname}")

        # the evaluation hooks to be applied to the prediction files
        self.pred_file_evaluators = pred_file_evaluators or []
        self.save_per_frame_scores = save_per_frame_scores
        # in addition to the prediction file, we also write the evaluation metrics
        # for easier debugging and analysis (stored in another eval_metrics_file
        # so that we can keep the dumped prediction file under YT-VIS format)
        self.write_eval_metrics_file = write_eval_metrics_file
        if self.write_eval_metrics_file:
            self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
            os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)

    def _dump_vid_preds(self, results):
        dumped_results = copy.deepcopy(results)
        self.dump.extend(dumped_results)

    def prepare(self, predictions):
        ytvis_results = []
        for video_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue
            for k in ["boxes", "scores", "labels"]:
                assert (
                    k in prediction
                ), f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
            if self.save_per_frame_scores:
                assert (
                    "per_frame_scores" in prediction
                ), f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
            assert xor(
                "masks" in prediction, "masks_rle" in prediction
            ), f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            if "masks" in prediction:
                masks = prediction["masks"].squeeze(2)
                assert (
                    masks.ndim == 4
                ), "Expected masks to be of shape(N_preds,T_frames,H,W)"

                areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
                rles = [rle_encode(masklet) for masklet in masks]

                # memory clean
                del masks
                del prediction["masks"]
            elif "masks_rle" in prediction:
                rles = prediction.pop("masks_rle")
                areas = [
                    [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                    for rles_per_obj in rles
                ]
            else:
                raise ValueError(
                    "Expected either `masks` or `masks_rle` key in the predictions."
                )

            new_results = [
                {
                    "video_id": video_id,
                    "category_id": track_label,
                    "bboxes": track_boxes,
                    "score": track_score,
                    "segmentations": track_masks,
                    "areas": track_areas,
                }
                for (
                    track_boxes,
                    track_masks,
                    track_areas,
                    track_score,
                    track_label,
                ) in zip(boxes, rles, areas, scores, labels)
            ]
            # Optionally, save per-frame scores
            if self.save_per_frame_scores:
                per_frame_scores = prediction["per_frame_scores"].tolist()
                for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                    res["per_frame_scores"] = track_per_frame_scores

            ytvis_results.extend(new_results)

        return ytvis_results

    def set_sync_device(self, device: torch.device):
        self._sync_device = device

    def update(self, *args, **kwargs):
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions)
        self._dump_vid_preds(results)

    def _dump_preds(self):
        if not dist.is_main_process():
            self.dump = []
            gc.collect()
            return
        dumped_file = Path(self.dump_file)
        logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
        self.dump = []
        gc.collect()
        return str(dumped_file)

    def synchronize_between_processes(self):
        logging.info("YT-VIS evaluator: Synchronizing between processes")
        dump_dict = self._dedup_pre_gather(self.dump)
        if self.gather_pred_via_filesys:
            dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
        else:
            dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
        self.dump = self._dedup_post_gather(dump_dict_all_gpus)
        logging.info(f"Gathered all {len(self.dump)} predictions")

    def _dedup_pre_gather(self, predictions):
        """
        Organize the predictions as a dict-of-list using (video_id, category_id) as keys
        for deduplication after gathering them across GPUs.

        During evaluation, PyTorch data loader under `drop_last: False` would wrap
        around the dataset length to be a multiple of world size (GPU num) and duplicate
        the remaining batches. This causes the same test sample to appear simultaneously
        in multiple GPUs, resulting in duplicated predictions being saved into prediction
        files. These duplicates are then counted as false positives under detection mAP
        metrics (since a ground truth can be matched with only one prediction).

        For example, if there are 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
        loader (under `drop_last: False`) would load it by wrapping it around like
        `[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make a multiple of 4 and then split it as

        - GPU 0: A1, C1
        - GPU 1: A2, C2
        - GPU 2: B1, **A1**
        - GPU 3: B2, **A2**
        (as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)

        so the predictions on A1 and A2 will occur twice in the final gathered outputs
        in the prediction file (and counted as false positives). This also affects our
        YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev since
        the latter is much smaller and more susceptible to false positives.

        So we need to deduplicate this. The tricky part is that we cannot deduplicate them
        simply using video id, given that we are sharding the classes in each video
        across multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.

        The solution is to deduplicate based on (video_id, category_id) tuple as keys.
        We organize the predictions as a dict-of-list using (video_id, category_id) as
        keys on each GPU, with the list of masklets under this (video_id, category_id)
        on this GPU as values. Then, we all-gather this dict-of-list across GPUs and
        if a key (video_id, category_id) appears in multiple GPUs, we only take the
        prediction masklet list from one GPU.
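
        For instance (hypothetical ids), a per-GPU prediction list

            [{"video_id": 3, "category_id": 7, ...}, {"video_id": 3, "category_id": 9, ...}]

        is reorganized into

            {(3, 7): [{"video_id": 3, "category_id": 7, ...}],
             (3, 9): [{"video_id": 3, "category_id": 9, ...}]}

        before gathering, so duplicates across GPUs can be dropped by key.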
        """
        prediction_dict = defaultdict(list)
        for p in predictions:
            prediction_dict[(p["video_id"], p["category_id"])].append(p)
        return prediction_dict

    def _dedup_post_gather(self, list_of_prediction_dict):
        """
        Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
        """
        dedup_prediction_dict = {}
        duplication_keys = []
        for prediction_dict in list_of_prediction_dict:
            for k, v in prediction_dict.items():
                if k not in dedup_prediction_dict:
                    dedup_prediction_dict[k] = v
                else:
                    duplication_keys.append(k)

        logging.info(
            f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
            f"with the following (video_id, category_id) tuples: {duplication_keys}"
        )
        dedup_predictions = sum(dedup_prediction_dict.values(), [])
        return dedup_predictions

    def compute_synced(self):
        self.synchronize_between_processes()
        dumped_file = self._dump_preds()
        if not dist.is_main_process():
            return {"": 0.0}

        # run evaluation hooks on the prediction file
        meters = {}
        all_video_np_level_results = defaultdict(dict)
        for evaluator in self.pred_file_evaluators:
            gc.collect()
            results, video_np_level_results = evaluator.evaluate(dumped_file)
            meters.update(results)
            for (video_id, category_id), res in video_np_level_results.items():
                all_video_np_level_results[(video_id, category_id)].update(res)

        gc.collect()
        if self.write_eval_metrics_file:
            # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
            # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
            # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
            video_np_level_metrics = [
                {"video_id": video_id, "category_id": category_id, **res}
                for (video_id, category_id), res in all_video_np_level_results.items()
            ]
            eval_metrics = {
                "dataset_level_metrics": meters,
                "video_np_level_metrics": video_np_level_metrics,
            }
            with g_pathmgr.open(self.eval_metrics_file, "w") as f:
                json.dump(eval_metrics, f)
            logging.info(
                f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
            )

        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        return {"": 0.0}

    def reset(self, *args, **kwargs):
        self.dump = []