Upload 30 files
- eval/README.md +100 -0
- eval/dataset/__init__.py +70 -0
- eval/dataset/musdb.py +75 -0
- eval/dataset/sam_audio_bench.py +153 -0
- eval/main.py +162 -0
- eval/metrics/__init__.py +13 -0
- eval/metrics/aes.py +49 -0
- eval/metrics/clap.py +46 -0
- eval/metrics/imagebind.py +52 -0
- eval/metrics/judge.py +44 -0
- sam_audio/__init__.py +4 -0
- sam_audio/model/__init__.py +4 -0
- sam_audio/model/align.py +50 -0
- sam_audio/model/base.py +58 -0
- sam_audio/model/codec.py +108 -0
- sam_audio/model/config.py +251 -0
- sam_audio/model/judge.py +135 -0
- sam_audio/model/model.py +362 -0
- sam_audio/model/patcher.py +164 -0
- sam_audio/model/rope.py +155 -0
- sam_audio/model/text_encoder.py +37 -0
- sam_audio/model/transformer.py +524 -0
- sam_audio/model/vision_encoder.py +113 -0
- sam_audio/processor.py +382 -0
- sam_audio/ranking/__init__.py +30 -0
- sam_audio/ranking/clap.py +84 -0
- sam_audio/ranking/imagebind.py +197 -0
- sam_audio/ranking/judge.py +42 -0
- sam_audio/ranking/ranker.py +36 -0
- sam_audio/ranking/sound_activity.py +129 -0
eval/README.md
ADDED
@@ -0,0 +1,100 @@
# Evaluation

This directory contains the evaluation code to reproduce the results from the SAM-Audio paper. The evaluation framework supports multiple datasets, prompting modes (text-only, span, visual), and metrics.

## Setup

Before running evaluation, ensure you have:

1. Installed the SAM-Audio package and its dependencies
2. Authenticated with Hugging Face to access the model checkpoints (see main [README](../README.md))

## Quick Start

Run evaluation on the default setting (`instr-pro`):

```bash
python main.py
```

You can also use multiple GPUs to speed up evaluation:

```bash
torchrun --nproc_per_node=<ngpus> main.py
```

Evaluate on a specific setting:

```bash
python main.py --setting sfx
```

Evaluate on multiple settings:

```bash
python main.py --setting sfx speech music
```

## Available Evaluation Settings

Run `python main.py --help` to see all available settings.

## Command Line Options

```bash
python main.py [OPTIONS]
```

### Options:

- `-s, --setting` - Which setting(s) to evaluate (default: `instr-pro`)
  - Choices: See available settings above
  - Can specify multiple settings: `--setting sfx speech music`

- `--cache-path` - Where to cache downloaded datasets (default: `~/.cache/sam_audio`)

- `-p, --checkpoint-path` - Model checkpoint to evaluate (default: `facebook/sam-audio-large`)
  - Can use a local path or a Hugging Face model ID

- `-b, --batch-size` - Batch size for evaluation (default: `1`)

- `-w, --num-workers` - Number of data loading workers (default: `4`)

- `-c, --candidates` - Number of reranking candidates (default: `8`)

## Evaluation Metrics

The evaluation framework computes the following metrics:

- **Judge** - SAM Audio Judge quality assessment metric
- **Aesthetic** - Aesthetic quality metric
- **CLAP** - Audio-text alignment metric (CLAP similarity)
- **ImageBind** - Audio-video alignment metric (for visual settings only)

## Output

Results are saved to the `results/` directory as JSON files, one per setting:

```
results/
├── sfx.json
├── speech.json
└── music.json
```

Each JSON file contains the averaged metric scores across all samples in that setting.

Example output:
```json
{
    "JudgeOverall": "4.386",
    "JudgeFaithfulness": "4.708",
    "JudgeRecall": "4.934",
    "JudgePrecision": "4.451",
    "ContentEnjoyment": "5.296",
    "ContentUsefulness": "6.903",
    "ProductionComplexity": "4.301",
    "ProductionQuality": "7.100",
    "CLAPSimilarity": "0.271"
}
```
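
For illustration, a minimal sketch (assuming the `results/` layout described above, where each value is stored as a formatted string) of collecting several settings into one comparison table:

```python
# Sketch: load per-setting result files written by eval/main.py into one table.
import json

import pandas as pd

settings = ["sfx", "speech", "music"]
rows = {}
for setting in settings:
    with open(f"results/{setting}.json") as fin:
        # values are stored as strings like "4.386"; cast back to float
        rows[setting] = {k: float(v) for k, v in json.load(fin).items()}

print(pd.DataFrame.from_dict(rows, orient="index").round(3))
```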
eval/dataset/__init__.py
ADDED
@@ -0,0 +1,70 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Callable

from .musdb import MUSDB
from .sam_audio_bench import SAMAudioBench

SETTINGS = {
    # Text-only settings
    "sfx": (
        SAMAudioBench,
        {"span": False, "visual": False, "subset": "others-50:text-only"},
    ),
    "speech": (
        SAMAudioBench,
        {"span": False, "visual": False, "subset": "speech-clean-50:text-only"},
    ),
    "speaker": (
        SAMAudioBench,
        {"span": False, "visual": False, "subset": "spk-50:text-only"},
    ),
    "music": (
        SAMAudioBench,
        {"span": False, "visual": False, "subset": "music-clean-50:text-only"},
    ),
    "instr-wild": (
        SAMAudioBench,
        {"span": False, "visual": False, "subset": "instr-50:text-only"},
    ),
    "instr-pro": (MUSDB, {}),
    # Span settings
    "sfx-span": (
        SAMAudioBench,
        {"span": True, "visual": False, "subset": "others-50:text+span"},
    ),
    "speech-span": (
        SAMAudioBench,
        {"span": True, "visual": False, "subset": "speech-clean-50:text+span"},
    ),
    "speaker-span": (
        SAMAudioBench,
        {"span": True, "visual": False, "subset": "spk-50:text+span"},
    ),
    "music-span": (
        SAMAudioBench,
        {"span": True, "visual": False, "subset": "music-clean-50:text+span"},
    ),
    "instr-wild-span": (
        SAMAudioBench,
        {"span": True, "visual": False, "subset": "instr-50:text+span"},
    ),
    # Visual settings
    "sfx-visual": (
        SAMAudioBench,
        {"span": False, "visual": True, "subset": "others-onscreen-50:visual-only"},
    ),
    "speaker-visual": (
        SAMAudioBench,
        {"span": False, "visual": True, "subset": "spk-onscreen-50:visual-only"},
    ),
    "instr-wild-visual": (
        SAMAudioBench,
        {"span": False, "visual": True, "subset": "instr-onscreen-50:visual-only"},
    ),
}


def make_dataset(setting: str, cache_path: str, collate_fn: Callable):
    dataset, kwargs = SETTINGS[setting]
    return dataset(cache_path=cache_path, collate_fn=collate_fn, **kwargs)
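
A minimal sketch of how this registry is consumed, mirroring `eval/main.py` (the processor doubles as the collate function; the checkpoint name is that script's default):

```python
# Sketch: build the "sfx" evaluation dataset through the SETTINGS registry.
import os

from dataset import make_dataset
from sam_audio import SAMAudioProcessor

processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
dset = make_dataset(
    "sfx",
    cache_path=os.path.expanduser("~/.cache/sam_audio"),
    collate_fn=processor,
)
print(len(dset), dset.visual)  # dataset size and whether visual prompts are used
```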
eval/dataset/musdb.py
ADDED
@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import os
from subprocess import check_call

import torchaudio
from datasets import load_dataset
from torch.utils.data import Dataset
from torchcodec.decoders import AudioDecoder


def cache_file(url, outfile):
    if not os.path.exists(outfile):
        print("Downloading musdb18hq dataset...")
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        check_call(["curl", "--url", url, "--output", outfile + ".tmp"])
        os.rename(outfile + ".tmp", outfile)


class MUSDB(Dataset):
    def __init__(
        self,
        collate_fn,
        sample_rate: int = 48_000,
        cache_path: str = os.path.expanduser("~/.cache/sam_audio"),
    ):
        self.cache_path = os.path.join(cache_path, "musdb18hq")
        self.ds = self.get_dataset(cache_path)
        self.captions = ["bass", "drums", "vocals"]
        self.collate_fn = collate_fn
        self.sample_rate = sample_rate

    @property
    def visual(self):
        return False

    def get_dataset(self, cache_path):
        zip_file = os.path.join(cache_path, "musdb18hq.zip")
        url = "https://zenodo.org/records/3338373/files/musdb18hq.zip?download=1"
        cache_file(url, zip_file)
        extracted_dir = os.path.join(cache_path, "musdb18hq")
        if not os.path.exists(extracted_dir):
            check_call(["unzip", zip_file, "-d", extracted_dir + ".tmp"])
            os.rename(extracted_dir + ".tmp", extracted_dir)
        return load_dataset("facebook/sam-audio-musdb18hq-test")["test"]

    def __len__(self):
        return len(self.ds)

    def collate(self, items):
        audios, descriptions = zip(*items, strict=False)
        return self.collate_fn(
            audios=audios,
            descriptions=descriptions,
        )

    def __getitem__(self, idx):
        item = self.ds[idx]
        path = os.path.join(self.cache_path, "test", item["id"], "mixture.wav")
        assert os.path.exists(path), f"{path} does not exist!"
        decoder = AudioDecoder(path)
        data = decoder.get_samples_played_in_range(item["start_time"], item["end_time"])
        wav = data.data
        if data.sample_rate != self.sample_rate:
            wav = torchaudio.functional.resample(
                wav, data.sample_rate, self.sample_rate
            )
        wav = wav.mean(0, keepdim=True)
        return wav, item["description"]


if __name__ == "__main__":
    dataset = MUSDB(lambda **kwargs: None)
    print(len(dataset))
    print(dataset[0])
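
For batching, `eval/main.py` passes the dataset's own `collate` method to the `DataLoader`; a minimal sketch of that wiring with `MUSDB` (note that constructing `MUSDB` triggers the MUSDB18-HQ download if it is not already cached, and the checkpoint name is the default from `eval/main.py`):

```python
# Sketch: iterate MUSDB batches the same way the evaluation loop does.
from torch.utils.data import DataLoader

from dataset.musdb import MUSDB
from sam_audio import SAMAudioProcessor

processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
dset = MUSDB(collate_fn=processor)
dl = DataLoader(dset, batch_size=1, shuffle=False, collate_fn=dset.collate)
batch = next(iter(dl))  # processed 48 kHz mono mixtures plus their descriptions
```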
eval/dataset/sam_audio_bench.py
ADDED
@@ -0,0 +1,153 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import os
from dataclasses import dataclass
from io import BytesIO
from typing import Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from datasets import load_dataset
from torchcodec.decoders import AudioDecoder, VideoDecoder


@dataclass
class Item:
    anchors: list[Tuple[str, float, float]]
    masked_video_frames: torch.Tensor
    audio_samples: torch.Tensor
    description: str


class SAMAudioBench(torch.utils.data.Dataset):
    def __init__(
        self,
        cache_path,
        collate_fn,
        span: bool = True,
        visual: bool = True,
        subset: Optional[str] = None,
    ):
        self.dataset = load_dataset("facebook/sam-audio-bench")["test"]
        self.subset = subset
        self._span = span
        self._visual = visual
        if subset is not None:
            self.dataset = self.dataset.filter(lambda x: subset in x["paper_eval_sets"])

        self.cache_path = os.path.join(cache_path, "sam_audio_bench")
        self.collate_fn = collate_fn
        DATA_MSG = (
            f"`SAMAudioBench` requires the user to create a directory named {self.cache_path}; "
            "see the README.md file for how to prepare it"
        )
        assert os.path.exists(self.cache_path), DATA_MSG

    @property
    def visual(self):
        return self._visual

    def __len__(self):
        return len(self.dataset)

    def _get_path(
        self, video_id: str, source_dataset: str, start_offset: float, end_offset: float
    ) -> Tuple[str, bool]:
        path = f"{self.cache_path}/{source_dataset}/{video_id}.mp4"
        select_frames = True

        if not os.path.exists(path):
            path = f"{self.cache_path}/{source_dataset}/{video_id}_{int(start_offset * 1000)}_{int(end_offset * 1000)}.mp4"
            select_frames = False

        if not os.path.exists(path):
            path = f"{self.cache_path}/{source_dataset}/{video_id}_{int(start_offset)}_{int(end_offset)}.mp4"

        if not os.path.exists(path):
            path = f"{self.cache_path}/{source_dataset}/{video_id}.{int(start_offset * 1000):08d}_{int(end_offset * 1000):08d}.mp4"

        return path, select_frames

    def collate(self, items: list[Item]):
        has_video = any(item.masked_video_frames is not None for item in items)
        return self.collate_fn(
            descriptions=[item.description for item in items],
            audios=[item.audio_samples for item in items],
            anchors=[item.anchors for item in items] if self._span else None,
            masked_videos=[item.masked_video_frames for item in items]
            if has_video and self._visual
            else None,
        )

    def _get_masked_video(self, item, video_path, select_frames):
        if item["mask_bytes"] is None:
            return None

        mask = torch.from_numpy(np.load(BytesIO(item["mask_bytes"]))["video_masklet"])

        video_decoder = VideoDecoder(video_path)
        if select_frames:
            video_frames = video_decoder.get_frames_played_in_range(
                item["start_offset"], item["end_offset"]
            ).data
        else:
            video_frames = video_decoder[:].data

        if mask.size(0) != video_frames.size(0):
            # It's possible that the mask and the video frames differ by a small amount;
            # we interpolate the mask frames to match
            idxs = (
                torch.linspace(0, mask.size(0) - 1, video_frames.size(0)).round().long()
            )
            mask = mask[idxs]

        mask = mask.unsqueeze(1)

        if mask.shape[-2:] != video_frames.shape[-2:]:
            mask = F.interpolate(mask, size=video_frames.shape[-2:])

        import torchvision

        torchvision.io.write_video("test.mp4", video_frames.permute(0, 2, 3, 1), 30)
        torchvision.io.write_video(
            "test_mask.mp4", mask.unsqueeze(-1).expand(-1, -1, -1, 3) * 255, 30
        )

        return video_frames * mask

    def __getitem__(self, idx) -> Item:
        item = self.dataset[idx]

        video_path, select_frames = self._get_path(
            item["video_id"],
            item["source_dataset"],
            item["start_offset"],
            item["end_offset"],
        )
        assert os.path.exists(video_path), f"{video_path} does not exist!"

        audio_decoder = AudioDecoder(video_path)
        audio_samples = audio_decoder.get_samples_played_in_range(
            start_seconds=item["start_offset"] if select_frames else 0,
            stop_seconds=item["end_offset"] if select_frames else None,
        )

        if audio_samples.sample_rate != self.collate_fn.audio_sampling_rate:
            resampled_audio = torchaudio.functional.resample(
                audio_samples.data,
                audio_samples.sample_rate,
                self.collate_fn.audio_sampling_rate,
            )
        else:
            resampled_audio = audio_samples.data

        masked_video_frames = self._get_masked_video(item, video_path, select_frames)

        return Item(
            description=item["description"],
            anchors=[("+", start, end) for start, end in item["spans"]],
            masked_video_frames=masked_video_frames,
            audio_samples=resampled_audio.mean(0, keepdim=True),
        )
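
A standalone illustration of the frame-alignment step inside `_get_masked_video`: when the stored masklet has a different frame count than the decoded clip, nearest mask frames are picked with a rounded `linspace` before spatially resizing the mask to the video resolution (the shapes below are made up for the example):

```python
import torch
import torch.nn.functional as F

mask = (torch.rand(90, 64, 64) > 0.5).float()  # 90 mask frames at low resolution
video = torch.rand(120, 3, 256, 256)           # 120 decoded RGB frames

# Temporal alignment: pick the nearest mask frame for each video frame.
idxs = torch.linspace(0, mask.size(0) - 1, video.size(0)).round().long()
mask = mask[idxs].unsqueeze(1)                 # (120, 1, 64, 64)

# Spatial alignment, then zero out everything outside the mask.
mask = F.interpolate(mask, size=video.shape[-2:])
masked_video = video * mask
print(masked_video.shape)  # torch.Size([120, 3, 256, 256])
```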
eval/main.py
ADDED
@@ -0,0 +1,162 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import argparse
import json
import os

import pandas as pd
import torch
import torch.distributed as dist
from dataset import SETTINGS, make_dataset
from metrics import CLAP, Aesthetic, ImageBind, Judge
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

from sam_audio import SAMAudio, SAMAudioProcessor


def gather_and_average_results(results, world_size):
    if world_size == 1:
        return json.loads(results.mean().to_json())

    # 1. Gather the per-rank sums and counts to all ranks
    all_results = [None for _ in range(world_size)]
    dist.all_gather_object(
        all_results, {"sum": results.sum().to_json(), "count": len(results)}
    )

    # 2. Accumulate sums and counts across ranks
    summed = {}
    counts = 0

    for res in all_results:
        for k, v in json.loads(res["sum"]).items():
            if k not in summed:
                summed[k] = 0.0
            summed[k] += v
        counts += res["count"]

    # 3. Compute the average for keys that appeared at least once
    averaged = {k: summed[k] / counts for k in summed}

    return averaged


def main(
    settings: list[str],
    cache_path: str,
    batch_size: int,
    checkpoint_path: str,
    num_workers: int = 4,
    reranking_candidates: int = 8,
):
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    rank = int(os.environ.get("RANK", 0))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if world_size > 1:
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)

    model = SAMAudio.from_pretrained(checkpoint_path)
    model = model.eval().to(device)
    processor = SAMAudioProcessor.from_pretrained(checkpoint_path)

    judge_metric = Judge(device=device)
    aes_metric = Aesthetic(device=device)
    clap_metric = CLAP(device=device)
    imagebind_metric = ImageBind(device=device)

    for setting in settings:
        print(f"Evaluating: {setting}")
        dset = make_dataset(setting, cache_path=cache_path, collate_fn=processor)
        sampler = None
        if world_size > 1:
            sampler = DistributedSampler(dset)

        dl = DataLoader(
            dset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dset.collate,
            num_workers=num_workers,
            sampler=sampler,
        )

        all_metrics = [
            judge_metric,
            aes_metric,
            clap_metric,
        ]

        if dset.visual:
            all_metrics.append(imagebind_metric)

        dfs = []
        with torch.inference_mode():
            for batch in tqdm(dl, disable=rank > 0):
                batch = batch.to(device)
                result = model.separate(
                    batch, reranking_candidates=reranking_candidates
                )
                mets = {}
                for metric in all_metrics:
                    input_wavs = model.unbatch(batch.audios.squeeze(1), batch.wav_sizes)

                    mets.update(
                        metric(
                            target_wavs=result.target,
                            target_wavs_sample_rate=model.sample_rate,
                            descriptions=batch.descriptions,
                            input_wavs=input_wavs,
                            videos=batch.masked_video,
                        )
                    )

                dfs.append(pd.DataFrame.from_dict(mets))

        df = pd.concat(dfs)
        averaged_results = gather_and_average_results(df, world_size)
        if rank == 0:
            results_dict = {k: f"{v:.3f}" for k, v in averaged_results.items()}
            print(json.dumps(results_dict, indent=4))
            os.makedirs("results", exist_ok=True)
            outfile = f"results/{setting}.json"
            with open(outfile, "w") as fout:
                print(json.dumps(results_dict), file=fout)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--setting",
        "-s",
        choices=SETTINGS.keys(),
        help=f"Which setting to evaluate. Choices: {SETTINGS.keys()}",
        default=["instr-pro"],
        nargs="+",
    )
    parser.add_argument(
        "--cache-path",
        type=str,
        default=os.path.expanduser("~/.cache/sam_audio"),
        help="Where to cache downloaded datasets",
    )
    parser.add_argument(
        "--checkpoint-path", "-p", type=str, default="facebook/sam-audio-large"
    )
    parser.add_argument("--batch-size", "-b", type=int, default=1, help="Batch size")
    parser.add_argument(
        "--num-workers", "-w", type=int, default=4, help="Number of workers"
    )
    parser.add_argument("--candidates", "-c", type=int, default=8)
    opt = parser.parse_args()
    main(
        settings=opt.setting,
        cache_path=opt.cache_path,
        batch_size=opt.batch_size,
        checkpoint_path=opt.checkpoint_path,
        num_workers=opt.num_workers,
        reranking_candidates=opt.candidates,
    )
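
The `main` function can also be driven programmatically; a sketch equivalent to the CLI defaults in the argument parser above (run from the `eval/` directory):

```python
# Sketch: run the default instr-pro evaluation without the argparse front end.
import os

from main import main

main(
    settings=["instr-pro"],
    cache_path=os.path.expanduser("~/.cache/sam_audio"),
    batch_size=1,
    checkpoint_path="facebook/sam-audio-large",
    num_workers=4,
    reranking_candidates=8,
)
```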
eval/metrics/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from metrics.aes import Aesthetic
from metrics.clap import CLAP
from metrics.imagebind import ImageBind
from metrics.judge import Judge

__all__ = [
    "Aesthetic",
    "CLAP",
    "ImageBind",
    "Judge",
]
eval/metrics/aes.py
ADDED
@@ -0,0 +1,49 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Optional

import torch
from audiobox_aesthetics.infer import AesPredictor

COLUMN_MAP = {
    "CE": "ContentEnjoyment",
    "CU": "ContentUsefulness",
    "PC": "ProductionComplexity",
    "PQ": "ProductionQuality",
}


class Aesthetic(torch.nn.Module):
    def __init__(
        self,
        checkpoint: Optional[str] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        self.model = AesPredictor(
            checkpoint_pth=checkpoint,
            data_col="wav",
        )
        self.device = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

    def __call__(
        self,
        target_wavs: list[torch.Tensor],
        target_wavs_sample_rate: int = 48_000,
        **kwargs,
    ) -> dict[str, list[float]]:
        result = self.model.forward(
            [
                {
                    "wav": wav[None] if wav.ndim == 1 else wav,
                    "sample_rate": target_wavs_sample_rate,
                }
                for wav in target_wavs
            ]
        )
        return {
            long_name: [x[shortname] for x in result]
            for shortname, long_name in COLUMN_MAP.items()
        }
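
A quick usage sketch for the wrapper above (the one-second noise tensor is only a placeholder input):

```python
import torch

from metrics.aes import Aesthetic

aes = Aesthetic(device=torch.device("cpu"))
wavs = [torch.randn(1, 48_000)]  # one second of mono audio at 48 kHz
scores = aes(target_wavs=wavs, target_wavs_sample_rate=48_000)
# keys: ContentEnjoyment, ContentUsefulness, ProductionComplexity, ProductionQuality
print(scores.keys())
```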
eval/metrics/clap.py
ADDED
@@ -0,0 +1,46 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from tempfile import TemporaryDirectory
from typing import Optional

import torch
from torchcodec.encoders import AudioEncoder

from sam_audio.ranking.clap import get_model


class CLAP(torch.nn.Module):
    def __init__(
        self,
        checkpoint: Optional[str] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        self.model = get_model(device)
        self.device = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

    def __call__(
        self,
        target_wavs: list[torch.Tensor],
        descriptions: list[str],
        target_wavs_sample_rate: int = 48_000,
        **kwargs,
    ) -> dict[str, list[float]]:
        with TemporaryDirectory() as tdir, torch.inference_mode():
            file_list = []
            for i, wav in enumerate(target_wavs):
                file_list.append(f"{tdir}/hyp_{i}.wav")
                encoder = AudioEncoder(
                    samples=wav.cpu()[None] if wav.ndim == 1 else wav.cpu(),
                    sample_rate=target_wavs_sample_rate,
                )
                encoder.to_file(file_list[-1])
            audio_embs = self.model.get_audio_embedding_from_filelist(
                file_list, use_tensor=True
            )

            text_embs = self.model.get_text_embedding(descriptions, use_tensor=True)
            sims = audio_embs.unsqueeze(1) @ text_embs.unsqueeze(2)
            return {"CLAPSimilarity": sims.cpu()[:, 0, 0].tolist()}
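
Usage follows the same pattern as the other metrics; a sketch with a placeholder waveform and caption:

```python
import torch

from metrics.clap import CLAP

clap = CLAP(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
wavs = [torch.randn(48_000)]  # one second of audio at 48 kHz (placeholder)
out = clap(target_wavs=wavs, descriptions=["a dog barking"])
print(out["CLAPSimilarity"])  # one audio-text similarity score per clip
```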
eval/metrics/imagebind.py
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Optional

import torch
from imagebind.models.imagebind_model import ModalityType, imagebind_huge

from sam_audio.ranking.imagebind import VideoTransform, load_and_transform_audio_data


class ImageBind(torch.nn.Module):
    def __init__(
        self,
        checkpoint: Optional[str] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()

        self.model = imagebind_huge(pretrained=checkpoint is None)
        if checkpoint is not None:
            self.model.load_state_dict(torch.load(checkpoint, map_location="cpu"))
        self.model = self.model.eval()
        self.video_transform = VideoTransform()
        self.device = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.model = self.model.to(self.device)

    def __call__(
        self,
        target_wavs: list[torch.Tensor],
        videos: list[torch.Tensor],
        target_wavs_sample_rate: int = 48_000,
        **kwargs,
    ) -> dict[str, list[float]]:
        audio_data = load_and_transform_audio_data(
            target_wavs, input_sample_rate=target_wavs_sample_rate
        )
        durations = [x.size(-1) / target_wavs_sample_rate for x in target_wavs]
        video_data = self.video_transform(videos, durations, audio_data.device)

        inputs = {ModalityType.AUDIO: audio_data, ModalityType.VISION: video_data}
        embs = self.model(inputs)
        audio_embs, video_embs = embs[ModalityType.AUDIO], embs[ModalityType.VISION]
        audio_embs, video_embs = (
            audio_embs / ((audio_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
            video_embs / ((video_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
        )
        bsz = len(target_wavs)
        candidates = len(audio_embs) // bsz
        scores = audio_embs.view(bsz, candidates, -1) @ video_embs.view(bsz, -1, 1)
        return {"ImageBind": scores.squeeze(1, 2).cpu().tolist()}
eval/metrics/judge.py
ADDED
@@ -0,0 +1,44 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Optional

import torch

from sam_audio import SAMAudioJudgeModel, SAMAudioJudgeProcessor


class Judge(torch.nn.Module):
    def __init__(
        self,
        checkpoint: str = "facebook/sam-audio-judge",
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        self.model = SAMAudioJudgeModel.from_pretrained(checkpoint).to(device)
        self.processor = SAMAudioJudgeProcessor.from_pretrained(checkpoint)
        self.device = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

    def forward(
        self,
        input_wavs: list[torch.Tensor],
        target_wavs: list[torch.Tensor],
        descriptions: list[str],
        target_wavs_sample_rate: int = 48_000,
        **kwargs,
    ) -> dict[str, list[float]]:
        with torch.inference_mode():
            processed = self.processor(
                text=descriptions,
                input_audio=[x.cpu() for x in input_wavs],
                separated_audio=[x.cpu() for x in target_wavs],
                sampling_rate=target_wavs_sample_rate,
            ).to(self.device)
            result = self.model(**processed)
            return {
                "JudgeOverall": result.overall.squeeze(-1).cpu().tolist(),
                "JudgeFaithfulness": result.faithfulness.squeeze(-1).cpu().tolist(),
                "JudgeRecall": result.recall.squeeze(-1).cpu().tolist(),
                "JudgePrecision": result.precision.squeeze(-1).cpu().tolist(),
            }
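
A sketch of calling the judge on a mixture/estimate pair (random tensors stand in for real audio; the checkpoint is the default above):

```python
import torch

from metrics.judge import Judge

judge = Judge(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
mixture = [torch.randn(1, 48_000)]    # original mixture, 48 kHz (placeholder)
separated = [torch.randn(1, 48_000)]  # separated estimate, 48 kHz (placeholder)
scores = judge(
    input_wavs=mixture,
    target_wavs=separated,
    descriptions=["a dog barking"],
)
print(scores["JudgeOverall"], scores["JudgeFaithfulness"])
```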
sam_audio/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from .model import *  # noqa
from .processor import *  # noqa
sam_audio/model/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from .model import *  # noqa
from .judge import *  # noqa
sam_audio/model/align.py
ADDED
@@ -0,0 +1,50 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Optional

import torch


class AlignModalities(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        normalize: bool = True,
        with_gate: bool = True,
    ):
        super().__init__()
        self.conv = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=1
        )
        self.normalize = normalize
        if self.normalize:
            self.layer_norm = torch.nn.LayerNorm(out_channels)

        self.gate = None
        if with_gate:
            self.gate = torch.nn.Parameter(torch.tensor([0.0]))

        self.out_channels = out_channels

    def forward(self, anchor: torch.Tensor, tgt: Optional[torch.Tensor] = None):
        """
        Align video features to the input audio features.

        Args:
            anchor (torch.Tensor): Input anchor tensor of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the channel size.
            tgt (Optional[torch.Tensor]): Optional features tensor to be aligned to the anchor, expected shape (B, in_channels, T).
        """
        if tgt is None:
            return anchor

        post_conv = self.conv(tgt)
        post_conv = post_conv.permute(0, 2, 1)  # BCT -> BTC

        if self.normalize:
            post_conv = self.layer_norm(post_conv)

        if self.gate is None:
            return post_conv
        else:
            return anchor + self.gate.tanh() * post_conv
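
A shape-level sketch of `AlignModalities` (dimension values below are arbitrary): the 1x1 convolution maps the target features to the anchor width, and the zero-initialized gate makes the fused output start exactly at the anchor.

```python
import torch

from sam_audio.model.align import AlignModalities

align = AlignModalities(in_channels=1024, out_channels=2048)
audio = torch.randn(2, 250, 2048)  # anchor features, (B, T, C)
video = torch.randn(2, 1024, 250)  # features to align, (B, in_channels, T)

fused = align(audio, video)        # gated residual add -> (2, 250, 2048)
print(fused.shape, torch.allclose(fused, audio))  # gate starts at 0 -> True
```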
sam_audio/model/base.py
ADDED
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import json
import os
from typing import Callable, Dict, Optional, Union

import torch
from huggingface_hub import ModelHubMixin, snapshot_download


class BaseModel(torch.nn.Module, ModelHubMixin):
    config_cls: Callable

    def device(self):
        return next(self.parameters()).device

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        cache_dir: str,
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: bool,
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",
        strict: bool = True,
        revision: Optional[str] = None,
        **model_kwargs,
    ):
        if os.path.isdir(model_id):
            cached_model_dir = model_id
        else:
            cached_model_dir = snapshot_download(
                repo_id=model_id,
                revision=cls.revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                token=token,
                local_files_only=local_files_only,
            )

        with open(os.path.join(cached_model_dir, "config.json")) as fin:
            config = json.load(fin)

        config = cls.config_cls(**config)
        model = cls(config)
        state_dict = torch.load(
            os.path.join(cached_model_dir, "checkpoint.pt"),
            weights_only=True,
            map_location=map_location,
        )
        model.load_state_dict(state_dict, strict=strict)
        return model
sam_audio/model/codec.py
ADDED
@@ -0,0 +1,108 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import math
from abc import ABCMeta, abstractmethod
from typing import Union

import dacvae
import torch

from sam_audio.model.config import DACVAEConfig


class Encoder(torch.nn.Module, metaclass=ABCMeta):
    @abstractmethod
    def forward(self, waveform: torch.Tensor) -> torch.Tensor: ...


class Codec(Encoder):
    @abstractmethod
    def decode(self, encoded_frames: torch.Tensor) -> torch.Tensor: ...

    @abstractmethod
    def wav_idx_to_feature_idx(
        self, wav_idx: Union[torch.Tensor, int], sample_rate=None
    ) -> Union[torch.Tensor, int]: ...

    @abstractmethod
    def feature_idx_to_wav_idx(
        self, feature_idx: Union[torch.Tensor, int], sample_rate=None
    ) -> Union[torch.Tensor, int]: ...

    @staticmethod
    def cast_to_int(
        x: Union[int, torch.Tensor],
    ) -> Union[int, torch.Tensor]:
        if isinstance(x, torch.Tensor):
            return x.int()
        else:
            return int(x)


class DACVAEEncoder(Encoder):
    def __init__(self, config: DACVAEConfig) -> None:
        super().__init__()
        model = dacvae.DACVAE(
            encoder_dim=config.encoder_dim,
            encoder_rates=config.encoder_rates,
            latent_dim=config.latent_dim,
            decoder_dim=config.decoder_dim,
            decoder_rates=config.decoder_rates,
            n_codebooks=config.n_codebooks,
            codebook_size=config.codebook_size,
            codebook_dim=config.codebook_dim,
            quantizer_dropout=config.quantizer_dropout,
            sample_rate=config.sample_rate,
        ).eval()
        self._setup_model(model)
        self.hop_length = config.hop_length
        self.sample_rate = config.sample_rate

    def _setup_model(self, model):
        self.encoder = model.encoder
        self.quantizer = model.quantizer

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            z = self.encoder(self._pad(waveform))
            mean, scale = self.quantizer.in_proj(z).chunk(2, dim=1)
            encoded_frames = mean
        return encoded_frames

    def _pad(self, wavs):
        length = wavs.size(-1)
        if length % self.hop_length:
            p1d = (0, self.hop_length - (length % self.hop_length))
            return torch.nn.functional.pad(wavs, p1d, "reflect")
        else:
            return wavs


class DACVAE(DACVAEEncoder, Codec):
    def _setup_model(self, model):
        super()._setup_model(model)
        self.decoder = model.decoder

    def decode(self, encoded_frames: torch.Tensor) -> torch.Tensor:
        emb = self.quantizer.out_proj(encoded_frames)
        return self.decoder(emb)

    def feature_idx_to_wav_idx(self, feature_idx, sample_rate=None):
        if sample_rate is None:
            sample_rate = self.sample_rate
        orig_freq = sample_rate
        new_freq = self.sample_rate
        wav_chunklen = feature_idx * self.hop_length * (orig_freq / new_freq)
        return self.cast_to_int(wav_chunklen)

    def wav_idx_to_feature_idx(self, wav_idx, sample_rate=None):
        ceil = math.ceil
        if torch.is_tensor(wav_idx):
            ceil = torch.ceil
        if sample_rate is None:
            sample_rate = self.sample_rate
        orig_freq = sample_rate
        new_freq = self.sample_rate
        target_length = ceil(new_freq * wav_idx / orig_freq)
        res = ceil(target_length / self.hop_length)
        return self.cast_to_int(res)
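
The index conversions above boil down to the codec hop length, which is the product of the encoder rates (2 * 8 * 10 * 12 = 1920 samples per latent frame at 48 kHz, per the `DACVAEConfig` defaults). A quick arithmetic check at the native sample rate:

```python
import math

hop_length = 2 * 8 * 10 * 12  # 1920, matching DACVAEConfig.hop_length defaults

wav_idx = 48_000                               # one second of 48 kHz audio
feature_idx = math.ceil(wav_idx / hop_length)  # 25 latent frames
wav_back = feature_idx * hop_length            # 48000 samples
print(feature_idx, wav_back)
```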
sam_audio/model/config.py
ADDED
@@ -0,0 +1,251 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Optional, Tuple

import numpy as np
from core.audio_visual_encoder.config import TransformerConfig as PEAVTransformerConfig
from transformers import ModernBertConfig


class DACVAEConfig:
    def __init__(
        self,
        encoder_dim: int = 64,
        encoder_rates: list[int] = [2, 8, 10, 12],
        latent_dim: int = 1024,
        decoder_dim: int = 1536,
        decoder_rates: list[int] = [12, 10, 8, 2],
        n_codebooks: int = 16,
        codebook_size: int = 1024,
        codebook_dim: int = 128,
        quantizer_dropout: bool = False,
        sample_rate: int = 48_000,
        mean: float = 0.0,
        std: float = 1.0,
    ):
        self.encoder_dim = encoder_dim
        self.encoder_rates = encoder_rates
        self.latent_dim = latent_dim
        self.decoder_dim = decoder_dim
        self.decoder_rates = decoder_rates
        self.n_codebooks = n_codebooks
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.quantizer_dropout = quantizer_dropout
        self.sample_rate = sample_rate
        self.mean = mean
        self.std = std

    @property
    def hop_length(self):
        return int(np.prod(self.encoder_rates))


class TextEncoderConfig:
    def __init__(self, dim: int = 768):
        self.dim = dim


class T5EncoderConfig(TextEncoderConfig):
    def __init__(
        self,
        name: str = "t5-base",
        max_length: Optional[int] = 512,
        pad_mode: str = "longest",
        dim: int = 768,
    ):
        super().__init__(dim=dim)
        self.name = name
        self.max_length = max_length
        self.pad_mode = pad_mode


class VisionEncoderConfig:
    def __init__(self, dim: int = 1024, batch_size: int = 300):
        self.dim = dim
        self.batch_size = batch_size


class PerceptionEncoderConfig(VisionEncoderConfig):
    def __init__(
        self,
        dim: int = 1024,
        batch_size: int = 300,
        name: str = "PE-Core-L14-336",
        normalize_feature: bool = True,
        interpolation_mode: str = "BICUBIC",
        image_size: int = 336,
    ):
        super().__init__(dim=dim, batch_size=batch_size)
        self.name = name
        self.normalize_feature = normalize_feature
        self.interpolation_mode = interpolation_mode
        self.image_size = image_size


class TransformerConfig:
    def __init__(
        self,
        dim: int = 2048,
        n_heads: int = 16,
        n_layers: int = 16,
        dropout: float = 0.1,
        norm_eps: float = 1.0e-05,
        qk_norm: bool = True,
        fc_bias: bool = False,
        ffn_exp: int = 4,
        ffn_dim_multiplier: int = 1,
        multiple_of: int = 64,
        non_linearity: str = "swiglu",
        use_rope: bool = True,
        max_positions: int = 10000,
        frequency_embedding_dim: int = 256,
        timestep_non_linearity: str = "swiglu",
        t_block_non_linearity: str = "silu",
        t_block_bias: bool = True,
        context_dim: int = 2048,
        context_non_linearity: str = "swiglu",
        context_embedder_dropout: float = 0.0,
        context_norm: bool = False,
        out_channels: int = 256,
        in_channels: Optional[int] = None,
    ):
        self.dim = dim
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.dropout = dropout
        self.norm_eps = norm_eps
        self.qk_norm = qk_norm
        self.fc_bias = fc_bias
        self.ffn_exp = ffn_exp
        self.ffn_dim_multiplier = ffn_dim_multiplier
        self.multiple_of = multiple_of
        self.non_linearity = non_linearity
        self.use_rope = use_rope
        self.max_positions = max_positions
        self.frequency_embedding_dim = frequency_embedding_dim
        self.timestep_non_linearity = timestep_non_linearity
        self.t_block_non_linearity = t_block_non_linearity
        self.t_block_bias = t_block_bias
        self.context_dim = context_dim
        self.context_non_linearity = context_non_linearity
        self.context_embedder_dropout = context_embedder_dropout
        self.context_norm = context_norm
        self.out_channels = out_channels
        self.in_channels = in_channels


class RankerConfig:
    kind: str


class ImageBindRankerConfig(RankerConfig):
    kind: str = "imagebind"

    def __init__(self, checkpoint: Optional[str] = None):
        self.checkpoint = checkpoint


class ClapRankerConfig(RankerConfig):
    kind: str = "clap"

    def __init__(self, checkpoint: Optional[str] = None):
        self.checkpoint = checkpoint


class JudgeRankerConfig(RankerConfig):
    kind: str = "judge"

    def __init__(self, checkpoint_or_model_id: str = "facebook/sam-audio-judge"):
        self.checkpoint_or_model_id = checkpoint_or_model_id


class SoundActivityRankerConfig(RankerConfig):
    kind: str = "sound_activity"

    def __init__(
        self,
        threshold_mode: str = "rel_to_max",
        sil_threshold: float = -40,
        metric: str = "iou",
    ):
        self.threshold_mode = threshold_mode
        self.sil_threshold = sil_threshold
        self.metric = metric


class EnsembleRankerConfig(RankerConfig):
    kind: str = "ensemble"

    def __init__(self, rankers: dict[str, Tuple[RankerConfig, float]]):
        self.rankers = rankers


def parse_ranker_config(config_dict: dict):
    kind = config_dict.pop("kind")
    match kind:
        case ImageBindRankerConfig.kind:
            return ImageBindRankerConfig(**config_dict)
        case ClapRankerConfig.kind:
            return ClapRankerConfig(**config_dict)
        case JudgeRankerConfig.kind:
            return JudgeRankerConfig(**config_dict)
        case SoundActivityRankerConfig.kind:
            return SoundActivityRankerConfig(**config_dict)
        case EnsembleRankerConfig.kind:
            return EnsembleRankerConfig(
                {
                    k: (parse_ranker_config(v), w)
                    for k, (v, w) in config_dict["rankers"].items()
                }
            )


class SAMAudioConfig:
    def __init__(
        self,
        in_channels: int = 768,
        audio_codec=None,
        text_encoder=None,
        vision_encoder=None,
        transformer=None,
        num_anchors: int = 3,
        anchor_embedding_dim: int = 128,
        visual_ranker=None,
        text_ranker=None,
        span_predictor: Optional[str] = "pe-a-frame-large",
    ):
        self.in_channels = in_channels
        self.audio_codec = DACVAEConfig(**(audio_codec or {}))
        self.text_encoder = T5EncoderConfig(**(text_encoder or {}))
        self.vision_encoder = PerceptionEncoderConfig(**(vision_encoder or {}))
        self.transformer = TransformerConfig(**(transformer or {}))
        self.num_anchors = num_anchors
        self.anchor_embedding_dim = anchor_embedding_dim
        self.visual_ranker = (
            None if visual_ranker is None else parse_ranker_config(visual_ranker)
        )
        self.text_ranker = (
            None if text_ranker is None else parse_ranker_config(text_ranker)
        )
        self.span_predictor = span_predictor


class SAMAudioJudgeConfig:
    def __init__(
        self,
        audio_codec: DACVAEConfig = None,
        transformer: PEAVTransformerConfig = None,
        text_model: ModernBertConfig = None,
        finetune_transformer: PEAVTransformerConfig = None,
        nth_text_layer: int = 22,
        bottleneck_dim: int = 256,
    ):
        self.audio_codec = DACVAEConfig(**(audio_codec or {}))
        self.transformer = PEAVTransformerConfig(**(transformer or {}))
        self.text_model = ModernBertConfig(**(text_model or {}))
        self.finetune_transformer = PEAVTransformerConfig(
            **(finetune_transformer or {})
        )
        self.nth_text_layer = nth_text_layer
        self.bottleneck_dim = bottleneck_dim
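
A sketch of how `parse_ranker_config` turns a plain dictionary into ranker configs, including the nested ensemble form (the weights here are illustrative):

```python
from sam_audio.model.config import parse_ranker_config

cfg = parse_ranker_config(
    {
        "kind": "ensemble",
        "rankers": {
            "clap": ({"kind": "clap"}, 0.5),
            "sound_activity": ({"kind": "sound_activity"}, 0.5),
        },
    }
)
print(type(cfg).__name__)                                   # EnsembleRankerConfig
print({k: type(v).__name__ for k, (v, _) in cfg.rankers.items()})
```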
sam_audio/model/judge.py
ADDED
@@ -0,0 +1,135 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from dataclasses import dataclass
from typing import Optional

import torch
from core.audio_visual_encoder.transformer import BaseModelOutputWithPooling
from core.audio_visual_encoder.transformer import Transformer as PEAVTransformer
from transformers import AutoModel

from .base import BaseModel
from .codec import DACVAEEncoder
from .config import SAMAudioJudgeConfig


@dataclass
class SAMAudioJudgeOutput:
    r"""
    overall (torch.Tensor, optional): Overall score tensor of shape (batch_size, 1).
    recall (torch.Tensor, optional): Recall score tensor of shape (batch_size, 1).
    precision (torch.Tensor, optional): Precision score tensor of shape (batch_size, 1).
    faithfulness (torch.Tensor, optional): Faithfulness score tensor of shape (batch_size, 1).
    text_model_output (BaseModelOutputWithPooling): Output from the text model.
    audio_model_output (BaseModelOutputWithPooling): Output from the audio model.
    """

    overall: Optional[torch.Tensor] = None
    recall: Optional[torch.Tensor] = None
    precision: Optional[torch.Tensor] = None
    faithfulness: Optional[torch.Tensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    audio_model_output: BaseModelOutputWithPooling = None


class SAMAudioJudgeModel(BaseModel):
    config_cls = SAMAudioJudgeConfig
    revision = "sam_audio"

    def __init__(self, config: SAMAudioJudgeConfig):
        super().__init__()
        self.config = config
        self.data_proj = torch.nn.Linear(
            config.audio_codec.codebook_dim, config.transformer.hidden_size
        )
        self.audio_codec = DACVAEEncoder(config.audio_codec)
        self.transformer = PEAVTransformer(config.transformer)
        self.finetune_transformer = PEAVTransformer(config.finetune_transformer)
        self.text_model = AutoModel.from_config(config.text_model)
        self.cat_audio_proj = torch.nn.Linear(
            2 * config.transformer.hidden_size, config.bottleneck_dim
        )
        self.text_proj1 = torch.nn.Linear(
            in_features=config.text_model.hidden_size,
            out_features=config.transformer.hidden_size,
            bias=False,
        )
        self.text_proj2 = torch.nn.Linear(
            in_features=config.transformer.hidden_size,
            out_features=config.bottleneck_dim,
        )
        self.layer_norm = torch.nn.LayerNorm(config.bottleneck_dim)
        self.proj_audio_and_text = torch.nn.Linear(
            2 * config.bottleneck_dim, config.bottleneck_dim
        )
        self.finetune_data_proj = torch.nn.Linear(
            config.bottleneck_dim, config.finetune_transformer.hidden_size
        )
        self.head = torch.nn.Linear(
            config.finetune_transformer.hidden_size, 4, bias=False
        )
        self.mean = torch.nn.Parameter(torch.zeros(4, requires_grad=False))
        self.std = torch.nn.Parameter(torch.ones(4, requires_grad=False))

    def _get_text_output(self, input_ids, attention_mask):
        nth_layer = self.config.nth_text_layer
        output = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=nth_layer is not None,
        )
        if nth_layer is None:
            text_model_output = output.last_hidden_state
        else:
            text_model_output = output.hidden_states[nth_layer]

        return BaseModelOutputWithPooling(
            last_hidden_state=text_model_output, pooler_output=text_model_output[:, 0]
        )

    def forward(
        self,
        input_ids: torch.Tensor,  # tokenized text
        input_values: torch.Tensor,  # input audio waveform
        separated_values: torch.Tensor,  # separated audio waveform
        attention_mask: Optional[torch.Tensor] = None,  # text attention mask
        padding_mask: Optional[torch.Tensor] = None,  # audio padding mask
    ) -> SAMAudioJudgeOutput:
        text_features = self.text_proj1(
            self._get_text_output(input_ids, attention_mask).pooler_output
        )
        stacked_audios = torch.cat([input_values, separated_values], dim=0)
        stacked_codec_features = self.audio_codec(stacked_audios)
        feature_padding_mask = None
        if padding_mask is not None:
            feature_padding_mask = padding_mask[
                :, :: self.config.audio_codec.hop_length
            ]
        stacked_features = self.transformer(
            self.data_proj(stacked_codec_features.transpose(1, 2)),
            padding_mask=feature_padding_mask,
        )
        input_features, hyp_features = stacked_features.last_hidden_state.chunk(2, 0)
        audio_features = self.cat_audio_proj(
            torch.cat([hyp_features, input_features], dim=2)
        )
        expanded_text = (
            self.layer_norm(self.text_proj2(text_features))
            .unsqueeze(1)
            .expand_as(audio_features)
        )
        audio_and_text = self.proj_audio_and_text(
            torch.cat([audio_features, expanded_text], dim=2)
        )
        finetune_transformer_output = self.finetune_transformer(
            self.finetune_data_proj(audio_and_text), padding_mask=feature_padding_mask
        )
        result = self.head(finetune_transformer_output.last_hidden_state)
        if feature_padding_mask is not None:
            feature_padding_mask = feature_padding_mask.unsqueeze(-1)
        pooled = torch.masked.mean(result, mask=feature_padding_mask, dim=1)
        de_normalized = pooled * self.std + self.mean
        return SAMAudioJudgeOutput(*de_normalized.chunk(4, dim=1))


__all__ = ["SAMAudioJudgeModel", "SAMAudioJudgeOutput"]
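The judge head emits four per-frame scores that are mean-pooled over time, de-normalized with the stored `mean`/`std` parameters, and split into the four named fields of `SAMAudioJudgeOutput`. A small, self-contained sketch of that final step using dummy tensors (shapes and the unmasked pooling are illustrative stand-ins, not the model's exact call):

```python
import torch

batch, frames = 2, 10
per_frame_scores = torch.randn(batch, frames, 4)   # stand-in for self.head(...)
mean, std = torch.zeros(4), torch.ones(4)          # stand-ins for self.mean / self.std

pooled = per_frame_scores.mean(dim=1)              # unmasked variant of the pooling above
de_normalized = pooled * std + mean
overall, recall, precision, faithfulness = de_normalized.chunk(4, dim=1)
print(overall.shape)                               # torch.Size([2, 1])
```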
sam_audio/model/model.py
ADDED
@@ -0,0 +1,362 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import math
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional

import torch
from core.audio_visual_encoder import PEAudioFrame, PEAudioFrameTransform
from torchdiffeq import odeint

from sam_audio.model.align import AlignModalities
from sam_audio.model.base import BaseModel
from sam_audio.model.codec import DACVAE
from sam_audio.model.config import SAMAudioConfig
from sam_audio.model.text_encoder import T5TextEncoder
from sam_audio.model.transformer import DiT
from sam_audio.model.vision_encoder import PerceptionEncoder
from sam_audio.processor import Batch
from sam_audio.ranking import create_ranker

DFLT_ODE_OPT = {"method": "midpoint", "options": {"step_size": 2 / 32}}


class SinusoidalEmbedding(torch.nn.Module):
    def __init__(self, dim, theta=10000):
        super().__init__()
        assert (dim % 2) == 0
        half_dim = dim // 2
        inv_freq = torch.exp(
            -math.log(theta) * torch.arange(half_dim).float() / half_dim
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, x, pos=None):
        if pos is None:
            seq_len, device = x.shape[1], x.device
            pos = torch.arange(seq_len, device=device)

        emb = torch.einsum("i, j -> i j", pos, self.inv_freq)
        emb = torch.cat((emb.cos(), emb.sin()), dim=-1)
        return emb


class EmbedAnchors(torch.nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, out_dim: int):
        super().__init__()
        self.embed = torch.nn.Embedding(
            num_embeddings + 1, embedding_dim, padding_idx=num_embeddings
        )
        self.gate = torch.nn.Parameter(torch.tensor([0.0]))
        self.proj = torch.nn.Linear(embedding_dim, out_dim, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        anchor_ids: Optional[torch.Tensor] = None,
        anchor_alignment: Optional[torch.Tensor] = None,
    ):
        if anchor_ids is None:
            return x

        embs = self.embed(anchor_ids.gather(1, anchor_alignment))
        proj = self.proj(embs)
        return x + self.gate.tanh() * proj


@dataclass
class SeparationResult:
    target: torch.Tensor
    residual: torch.Tensor
    noise: torch.Tensor


class SAMAudio(BaseModel):
    config_cls = SAMAudioConfig
    revision = None

    def __init__(self, cfg: SAMAudioConfig):
        super().__init__()
        self.audio_codec = DACVAE(cfg.audio_codec)
        self.text_encoder = T5TextEncoder(cfg.text_encoder)
        self.vision_encoder = PerceptionEncoder(cfg.vision_encoder)
        self.transformer = DiT(cfg.transformer)
        self.proj = torch.nn.Linear(cfg.in_channels, cfg.transformer.dim)
        self.align_masked_video = AlignModalities(
            cfg.vision_encoder.dim, cfg.transformer.dim
        )
        self.embed_anchors = EmbedAnchors(
            cfg.num_anchors, cfg.anchor_embedding_dim, cfg.transformer.dim
        )
        self.memory_proj = torch.nn.Linear(cfg.text_encoder.dim, cfg.transformer.dim)
        self.timestep_emb = SinusoidalEmbedding(cfg.transformer.dim)
        self.visual_ranker = create_ranker(cfg.visual_ranker)
        self.text_ranker = create_ranker(cfg.text_ranker)
        if cfg.span_predictor is not None:
            self.span_predictor = PEAudioFrame.from_config(
                cfg.span_predictor, pretrained=True
            )
            self.span_predictor_transform = PEAudioFrameTransform.from_config(
                cfg.span_predictor
            )

    @property
    def sample_rate(self):
        return self.audio_codec.sample_rate

    def align_inputs(
        self,
        noisy_audio,
        audio_features: torch.Tensor,
        masked_video_features: Optional[torch.Tensor] = None,
        anchor_ids: Optional[torch.Tensor] = None,
        anchor_alignment: Optional[torch.Tensor] = None,
    ):
        x = torch.cat(
            [
                noisy_audio,
                torch.zeros_like(audio_features),
                audio_features,
            ],
            dim=2,
        )

        projected = self.proj(x)
        aligned = self.align_masked_video(projected, masked_video_features)
        aligned = self.embed_anchors(aligned, anchor_ids, anchor_alignment)
        return aligned

    def forward(
        self,
        noisy_audio: torch.Tensor,
        audio_features: torch.Tensor,
        text_features: torch.Tensor,
        time: torch.Tensor,
        masked_video_features: Optional[torch.Tensor] = None,
        text_mask: Optional[torch.Tensor] = None,
        anchor_ids: Optional[torch.Tensor] = None,
        anchor_alignment: Optional[torch.Tensor] = None,
        audio_pad_mask: Optional[torch.Tensor] = None,
    ):
        """
        Forward pass for the model. Represents one function evaluation of the ODE.
        In the descriptions below, B is batch size, T is sequence length, and C is channel size.
        Note that the sizes of C and T may vary across arguments (e.g. text_features vs. audio_features);
        they only designate a channel or time/sequence-length dimension, respectively.

        Args:
            noisy_audio (torch.Tensor): Noisy audio input tensor (being denoised).
            audio_features (torch.Tensor): Clean audio features [B x T x C].
            text_features (torch.Tensor): Encoded text features tensor [B x T x C].
            time (torch.Tensor): Timestep tensor for positional encoding [B].
            masked_video_features (Optional[torch.Tensor], optional): Masked video features tensor [B x C x T].
            text_mask (Optional[torch.Tensor], optional): Padding mask for text features [B x T].
            anchor_ids (Optional[torch.Tensor], optional): Anchor IDs tensor. Defaults to None [B x T].
            anchor_alignment (Optional[torch.Tensor], optional): Anchor alignment tensor [B x T].
            audio_pad_mask (Optional[torch.Tensor], optional): Padding mask for audio input [B x T].

        Returns:
            torch.Tensor
        """
        aligned_inputs = self.align_inputs(
            noisy_audio,
            audio_features,
            masked_video_features=masked_video_features,
            anchor_ids=anchor_ids,
            anchor_alignment=anchor_alignment,
        )

        memory = timestep_emb = self.timestep_emb(time, pos=time).unsqueeze(1)
        if text_features is not None:
            memory = self.memory_proj(text_features) + timestep_emb

        return self.transformer(
            aligned_inputs,
            time,
            padding_mask=audio_pad_mask,
            memory=memory,
            memory_padding_mask=text_mask,
        )

    def _get_audio_features(self, audios: torch.Tensor):
        audio_features = self.audio_codec(audios).transpose(1, 2)
        return torch.cat([audio_features, audio_features], dim=2)

    def _get_video_features(self, video, audio_features):
        B, T, _ = audio_features.shape
        if video is None:
            return audio_features.new_zeros(B, self.vision_encoder.dim, T)
        else:
            return self.vision_encoder(video).transpose(1, 2)

    def _repeat_for_reranking(self, tensor, candidates):
        if candidates > 1:
            B = tensor.size(0)
            rest = tensor.shape[1:]
            return (
                tensor.unsqueeze(1)
                .expand(B, candidates, *rest)
                .reshape(B * candidates, *rest)
            )
        else:
            return tensor

    def _unrepeat_from_reranking(self, tensor, candidates):
        return tensor[::candidates]

    def _get_forward_args(self, batch: Batch, candidates: int = 1):
        audio_features = self._get_audio_features(batch.audios)
        text_features, text_mask = self.text_encoder(batch.descriptions)
        masked_video_features = self._get_video_features(
            batch.masked_video, audio_features
        )

        return {
            "audio_features": self._repeat_for_reranking(audio_features, candidates),
            "text_features": self._repeat_for_reranking(text_features, candidates),
            "text_mask": self._repeat_for_reranking(text_mask, candidates),
            "masked_video_features": self._repeat_for_reranking(
                masked_video_features, candidates
            ),
            "anchor_ids": self._repeat_for_reranking(batch.anchor_ids, candidates),
            "anchor_alignment": self._repeat_for_reranking(
                batch.anchor_alignment, candidates
            ),
            "audio_pad_mask": self._repeat_for_reranking(
                batch.audio_pad_mask, candidates
            ),
        }

    def predict_spans(
        self, batch: Batch, audio_features: torch.Tensor, audio_pad_mask: torch.Tensor
    ) -> Batch:
        input = self.span_predictor_transform(text=batch.descriptions).to(
            audio_features.device
        )
        output = self.span_predictor(
            input_features=audio_features[:, :, :128],
            padding_mask=audio_pad_mask,
            return_spans=True,
            **input,
        )
        anchors = [[["+"] + anchor for anchor in anchors] for anchors in output.spans]
        batch.process_anchors(anchors)
        return batch

    @torch.inference_mode()
    def separate(
        self,
        batch: Batch,
        noise: Optional[torch.Tensor] = None,
        ode_opt: Dict[str, Any] = DFLT_ODE_OPT,
        reranking_candidates: int = 1,
        predict_spans: bool = False,
    ) -> SeparationResult:
        # Encode audio
        forward_args = self._get_forward_args(batch, candidates=reranking_candidates)

        if predict_spans and hasattr(self, "span_predictor") and batch.anchors is None:
            batch = self.predict_spans(
                batch=batch,
                audio_features=self._unrepeat_from_reranking(
                    forward_args["audio_features"], reranking_candidates
                ),
                audio_pad_mask=self._unrepeat_from_reranking(
                    forward_args["audio_pad_mask"], reranking_candidates
                ),
            )

        audio_features = forward_args["audio_features"]
        B, T, C = audio_features.shape
        C = C // 2  # we stack audio_features, so the actual channels is half

        if noise is None:
            noise = torch.randn_like(audio_features)

        def vector_field(t, noisy_audio):
            res = self.forward(
                noisy_audio=noisy_audio,
                time=t.expand(noisy_audio.size(0)),
                **forward_args,
            )
            return res

        states = odeint(
            vector_field,
            noise,
            torch.tensor([0.0, 1.0], device=noise.device),
            **ode_opt,
        )
        generated_features = states[-1].transpose(1, 2)
        # generated_features has shape [B, 2C, T]. Reshape to stack along the batch dimension
        wavs = self.audio_codec.decode(generated_features.reshape(2 * B, C, T)).view(
            B, 2, -1
        )

        bsz = wavs.size(0) // reranking_candidates
        sizes = self.audio_codec.feature_idx_to_wav_idx(batch.sizes)
        target_wavs = self.unbatch(
            wavs[:, 0].view(bsz, reranking_candidates, -1), sizes
        )
        residual_wavs = self.unbatch(
            wavs[:, 1].view(bsz, reranking_candidates, -1), sizes
        )

        if (
            reranking_candidates > 1
            and batch.masked_video is not None
            and self.visual_ranker is not None
        ):
            scores = self.visual_ranker(
                extracted_audio=target_wavs,
                videos=batch.masked_video,
                sample_rate=self.audio_codec.sample_rate,
            )
            idxs = scores.argmax(dim=1)
        elif reranking_candidates > 1 and self.text_ranker is not None:
            input_audio = [
                audio[:, :size].expand(reranking_candidates, -1)
                for audio, size in zip(batch.audios, sizes, strict=False)
            ]
            scores = self.text_ranker(
                extracted_audio=target_wavs,
                input_audio=input_audio,
                descriptions=batch.descriptions,
                sample_rate=self.audio_codec.sample_rate,
            )
            idxs = scores.argmax(dim=1)
        else:
            idxs = torch.zeros(bsz, dtype=torch.long, device=noise.device)

        return SeparationResult(
            target=[wav[idx] for wav, idx in zip(target_wavs, idxs, strict=False)],
            residual=[
                wavs[idx] for wavs, idx in zip(residual_wavs, idxs, strict=False)
            ],
            noise=noise,
        )

    def unbatch(self, wavs: torch.Tensor, sizes: torch.Tensor, time_dim: int = -1):
        result = []
        for row, size in zip(wavs, sizes, strict=False):
            result.append(row.narrow(dim=time_dim, start=0, length=size))
        return result

    def load_state_dict(self, state_dict, strict=True):
        if strict:
            missing_keys, unexpected_keys = super().load_state_dict(
                state_dict, strict=False
            )
            # We load this directly from HF, not in checkpoint
            skip_regex = re.compile(
                "(^text_encoder|^visual_ranker|^text_ranker|^span_predictor)"
            )
            missing_keys = [x for x in missing_keys if not re.search(skip_regex, x)]
            if len(missing_keys) > 0 or len(unexpected_keys) > 0:
                raise RuntimeError(
                    f"Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}"
                )


__all__ = ["SAMAudio"]
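For best-of-N reranking, `separate` tiles every conditioning tensor so that each input appears `reranking_candidates` times along the batch dimension, and `_unrepeat_from_reranking` later recovers one row per original input. A small stand-alone sketch of the same expand/reshape pattern with arbitrary sizes:

```python
import torch

B, T, C, candidates = 2, 5, 3, 4
x = torch.randn(B, T, C)

# Same pattern as _repeat_for_reranking: B x ... -> (B * candidates) x ...
repeated = (
    x.unsqueeze(1)
    .expand(B, candidates, T, C)
    .reshape(B * candidates, T, C)
)
assert repeated.shape == (B * candidates, T, C)

# Same pattern as _unrepeat_from_reranking: keep one row per original input.
assert torch.equal(repeated[::candidates], x)
```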
sam_audio/model/patcher.py
ADDED
@@ -0,0 +1,164 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import math
from typing import Tuple

import torch
import torch.nn.functional as F
from einops import rearrange


def pad1d(
    x: torch.Tensor,
    paddings: Tuple[int, int],
    mode: str = "constant",
    value: float = 0.0,
):
    # Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py
    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right before the reflection happen.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == "reflect":
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)


def get_extra_padding_for_conv1d(
    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
) -> int:
    # Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py
    """See `pad_for_conv1d`."""
    length = x.shape[-1]
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length


class Conv1d(torch.nn.Conv1d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        kernel_size = self.kernel_size[0]
        stride = self.stride[0]
        dilation = self.dilation[0]
        kernel_size = (
            kernel_size - 1
        ) * dilation + 1  # effective kernel size with dilations
        padding_total = kernel_size - stride
        extra_padding = get_extra_padding_for_conv1d(
            x, kernel_size, stride, padding_total
        )
        # Asymmetric padding required for odd strides
        padding_right = padding_total // 2
        padding_left = padding_total - padding_right
        x = pad1d(x, (padding_left, padding_right + extra_padding))
        return super().forward(x)


class ConvBlock1d(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        *,
        kernel_size: int = 3,
        stride: int = 1,
        dilation: int = 1,
        num_groups: int = 8,
    ) -> None:
        super().__init__()

        self.groupnorm = torch.nn.GroupNorm(
            num_groups=num_groups, num_channels=in_channels
        )
        self.activation = torch.nn.SiLU()
        self.project = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
        )

    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        x = self.groupnorm(x)
        x = self.activation(x)
        return self.project(x)


class ResnetBlock1d(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        *,
        kernel_size: int = 3,
        stride: int = 1,
        dilation: int = 1,
        num_groups: int = 8,
    ) -> None:
        super().__init__()

        self.block1 = ConvBlock1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            num_groups=num_groups,
        )

        self.block2 = ConvBlock1d(
            in_channels=out_channels,
            out_channels=out_channels,
            num_groups=num_groups,
        )

        self.to_out = (
            Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
            if in_channels != out_channels
            else torch.nn.Identity()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.block1(x)
        h = self.block2(h)
        return h + self.to_out(x)


class Patcher(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        patch_size: int,
    ):
        super().__init__()
        assert_message = f"out_channels must be divisible by patch_size ({patch_size})"
        assert out_channels % patch_size == 0, assert_message
        self.patch_size = patch_size
        self.block = ResnetBlock1d(
            in_channels=in_channels,
            out_channels=out_channels // patch_size,
            num_groups=1,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.block(x)
        x = rearrange(x, "b c (l p) -> b (c p) l", p=self.patch_size)
        return x
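`Patcher` trades time resolution for channels: the ResNet block maps the input to `out_channels // patch_size` channels, and the `rearrange` then folds every `patch_size` time steps into the channel dimension. A shape-only sketch with arbitrary sizes, assuming the module above is importable:

```python
import torch
from sam_audio.model.patcher import Patcher

patcher = Patcher(in_channels=8, out_channels=16, patch_size=4)
x = torch.randn(2, 8, 32)   # (batch, in_channels, length)
y = patcher(x)
print(y.shape)              # torch.Size([2, 16, 8]); length shrinks by patch_size
```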
sam_audio/model/rope.py
ADDED
@@ -0,0 +1,155 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import math
from typing import Tuple

import torch


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor, seq_dim: int):
    """
    Reshape frequency tensor for broadcasting it with another tensor.

    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
    for the purpose of broadcasting the frequency tensor during element-wise operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
        x (torch.Tensor): Target tensor for broadcasting compatibility.
        seq_dim (int): Sequence dimension index.

    Returns:
        torch.Tensor: Reshaped frequency tensor.
    """
    ndim = x.ndim
    assert 0 <= seq_dim < ndim
    assert freqs_cis.shape == (
        x.shape[seq_dim],
        x.shape[-3],
        2,
        2,
    ), f"freqs_cis vs x: {(freqs_cis.shape, x.shape)}"
    shape = [
        d if i == seq_dim or i == ndim - 3 else 1 for i, d in enumerate(x.shape[:-2])
    ] + [2, 2]
    return freqs_cis.view(*shape)


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    seq_dim: int,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)  # B S H D -> B S H D/2 1 2
    xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)  # B S H D -> B S H D/2 1 2
    freqs_cis = reshape_for_broadcast(
        freqs_cis, xq_, seq_dim
    ).float()  # S D/2 2 2 -> 1 S 1 D/2 2 2
    xq_out = (xq_ * freqs_cis).sum(5).flatten(3)
    xk_out = (xk_ * freqs_cis).sum(5).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)


class RotaryEmbedding(torch.nn.Module):
    """
    RotaryEmbedding Module
    """

    def __init__(
        self,
        theta: float,
        head_dim: int,
        max_seqlen: int = 1024,
        scale_factor: int = 1,
        low_freq_factor: int = 1,
        high_freq_factor: int = 32,
        old_context_len: int = 8192,
    ):
        super().__init__()

        self.theta = theta
        self.head_dim = head_dim
        self.max_seqlen = max_seqlen
        self.scale_factor = scale_factor
        self.low_freq_factor = low_freq_factor
        self.high_freq_factor = high_freq_factor
        self.old_context_len = old_context_len
        if scale_factor != 1:
            self.low_freq_wavelen = old_context_len / low_freq_factor
            self.high_freq_wavelen = old_context_len / high_freq_factor
            assert self.low_freq_wavelen >= self.high_freq_wavelen

    def reset_parameters(self):
        freqs_cis = self.precompute_freqs_cis(
            dim=self.head_dim, end=self.max_seqlen, theta=self.theta
        )
        S, D, _, _ = freqs_cis.shape
        # S D 2 2 -> 1 S 1 D 2 2
        freqs_cis = freqs_cis.view(1, S, 1, D, 2, 2)
        self.register_buffer(
            "freqs_cis",
            freqs_cis,
            persistent=False,
        )

    def apply_scaling(self, freqs):
        if self.scale_factor == 1:
            return freqs
        new_freqs = []
        for freq in freqs:
            wavelen = 2 * math.pi / freq
            if wavelen < self.high_freq_wavelen:
                new_freqs.append(freq)
            elif wavelen > self.low_freq_wavelen:
                new_freqs.append(freq / self.scale_factor)
            else:
                assert self.low_freq_wavelen != self.high_freq_wavelen
                smooth = (self.old_context_len / wavelen - self.low_freq_factor) / (
                    self.high_freq_factor - self.low_freq_factor
                )
                new_freqs.append(
                    (1 - smooth) * freq / self.scale_factor + smooth * freq
                )
        return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)

    def precompute_freqs_cis(
        self,
        dim: int,
        end: int,
        theta: float = 10000.0,
    ):
        """
        Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

        This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
        and the end index 'end'. The 'theta' parameter scales the frequencies.
        The returned tensor contains complex values in complex64 data type.

        Args:
            dim (int): Dimension of the frequency tensor.
            end (int): End index for precomputing frequencies.
            theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.

        Returns:
            torch.Tensor: Precomputed frequency tensor with complex exponentials.
        """
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
        freqs = self.apply_scaling(freqs)

        t = torch.arange(end, device=freqs.device)
        freqs = torch.outer(t, freqs).float()

        cos, sin = freqs.cos(), freqs.sin()

        return torch.stack((cos, -sin, sin, cos), dim=-1).view(*freqs.size(), 2, 2)

    def forward(self, x: torch.Tensor, bhle: bool = False, **kwargs):
        if bhle:
            x = x.transpose(1, 2)  # (B H L E) -> (B L H E)
        seqlen = x.size(1)
        x_ = x.reshape(*x.shape[:-1], -1, 1, 2)  # B L H E -> B L H E/2 1 2
        x_out = (x_ * self.freqs_cis[:, :seqlen]).sum(5).flatten(3)
        if bhle:
            x_out = x_out.transpose(1, 2)
        return x_out.type_as(x)
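`reset_parameters` precomputes a `(1, S, 1, D/2, 2, 2)` rotation table, and `forward` applies it as a batched 2x2 rotation, preserving the input shape. A quick shape check with illustrative sizes, assuming the module above is importable:

```python
import torch
from sam_audio.model.rope import RotaryEmbedding

rope = RotaryEmbedding(theta=10000.0, head_dim=64, max_seqlen=1024)
rope.reset_parameters()         # builds the freqs_cis buffer

q = torch.randn(2, 16, 8, 64)   # (batch, seq, heads, head_dim)
q_rot = rope(q)
print(q_rot.shape)              # torch.Size([2, 16, 8, 64]); shape is preserved
```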
sam_audio/model/text_encoder.py
ADDED
@@ -0,0 +1,37 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from typing import Tuple

import torch
import transformers

from sam_audio.model.config import T5EncoderConfig


class T5TextEncoder(torch.nn.Module):
    def __init__(self, cfg: T5EncoderConfig):
        super().__init__()
        self.model = transformers.T5EncoderModel.from_pretrained(cfg.name)
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(cfg.name)
        self.pad_mode = cfg.pad_mode
        self.max_length = cfg.max_length

    def forward(self, texts: list[str]) -> Tuple[torch.Tensor, torch.Tensor]:
        device = next(self.model.parameters()).device
        encoded = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.max_length,
            padding=self.pad_mode,
            return_tensors="pt",
        )

        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)
        res = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )["last_hidden_state"]

        return res, attention_mask.bool()
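A minimal usage sketch, assuming `T5EncoderConfig`'s defaults point to a valid T5 checkpoint that Hugging Face can download; the encoder returns padded last-layer hidden states plus a boolean attention mask:

```python
from sam_audio.model.config import T5EncoderConfig
from sam_audio.model.text_encoder import T5TextEncoder

encoder = T5TextEncoder(T5EncoderConfig())   # loads the configured T5 checkpoint
features, mask = encoder(["a dog barking", "rain on a tin roof"])
print(features.shape, mask.shape)            # (2, T, hidden_dim), (2, T)
```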
sam_audio/model/transformer.py
ADDED
@@ -0,0 +1,524 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
from functools import partial
|
| 5 |
+
from typing import List, Optional, Union
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from einops import rearrange
|
| 11 |
+
|
| 12 |
+
from .config import TransformerConfig
|
| 13 |
+
from .patcher import Patcher
|
| 14 |
+
from .rope import RotaryEmbedding
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def gate(x, gate):
|
| 18 |
+
return x * gate
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def modulate(x, shift, scale):
|
| 22 |
+
return x * (1 + scale) + shift
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_nonlinearity(kind: str):
|
| 26 |
+
return {
|
| 27 |
+
"relu": F.relu,
|
| 28 |
+
"gelu": F.gelu,
|
| 29 |
+
"swiglu": None,
|
| 30 |
+
"approx_gelu": partial(F.gelu, approximate="tanh"),
|
| 31 |
+
"srelu": lambda x: F.relu(x) ** 2,
|
| 32 |
+
"silu": F.silu,
|
| 33 |
+
}[kind]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class RMSNorm(torch.nn.Module):
|
| 37 |
+
def __init__(self, dim: int, eps: float = 1e-5):
|
| 38 |
+
super().__init__()
|
| 39 |
+
self.eps = eps
|
| 40 |
+
self.weight = torch.nn.Parameter(torch.ones(dim))
|
| 41 |
+
|
| 42 |
+
def _norm(self, x):
|
| 43 |
+
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
|
| 44 |
+
|
| 45 |
+
def forward(self, x):
|
| 46 |
+
output = self._norm(x.float())
|
| 47 |
+
return (output * self.weight).type_as(x)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ProjectionLayer(torch.nn.Module):
|
| 51 |
+
def __init__(
|
| 52 |
+
self,
|
| 53 |
+
in_dim: int,
|
| 54 |
+
out_dim: int,
|
| 55 |
+
non_linearity: str,
|
| 56 |
+
dropout: float,
|
| 57 |
+
fc_bias: bool = False,
|
| 58 |
+
):
|
| 59 |
+
super().__init__()
|
| 60 |
+
|
| 61 |
+
self.swiglu = non_linearity == "swiglu"
|
| 62 |
+
self.dropout = dropout
|
| 63 |
+
self.w1 = torch.nn.Linear(in_dim, out_dim, bias=fc_bias)
|
| 64 |
+
|
| 65 |
+
self.w2 = torch.nn.Linear(out_dim, out_dim, bias=fc_bias)
|
| 66 |
+
if self.swiglu:
|
| 67 |
+
self.w3 = torch.nn.Linear(in_dim, out_dim, bias=fc_bias)
|
| 68 |
+
|
| 69 |
+
# non-linearity
|
| 70 |
+
self.non_linearity = get_nonlinearity(non_linearity)
|
| 71 |
+
|
| 72 |
+
def forward(self, x):
|
| 73 |
+
hidden1 = self.w1(x)
|
| 74 |
+
if self.swiglu:
|
| 75 |
+
hidden3 = self.w3(x)
|
| 76 |
+
hidden = F.silu(hidden1) * hidden3
|
| 77 |
+
else:
|
| 78 |
+
hidden = self.non_linearity(hidden1)
|
| 79 |
+
hidden = F.dropout(hidden, p=self.dropout, training=self.training)
|
| 80 |
+
return self.w2(hidden)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class Attention(nn.Module):
|
| 84 |
+
def __init__(
|
| 85 |
+
self,
|
| 86 |
+
dim: int,
|
| 87 |
+
head_dim: int,
|
| 88 |
+
n_heads: int,
|
| 89 |
+
n_kv_heads: int,
|
| 90 |
+
norm_eps: float = 1e-5,
|
| 91 |
+
use_qk_norm: bool = False,
|
| 92 |
+
fc_bias: bool = False,
|
| 93 |
+
):
|
| 94 |
+
super().__init__()
|
| 95 |
+
assert n_heads % n_kv_heads == 0
|
| 96 |
+
|
| 97 |
+
self.head_dim = head_dim
|
| 98 |
+
self.n_heads = n_heads
|
| 99 |
+
self.n_kv_heads = n_kv_heads
|
| 100 |
+
self.use_qk_norm = use_qk_norm
|
| 101 |
+
|
| 102 |
+
self.wq = torch.nn.Linear(dim, n_heads * head_dim, bias=fc_bias)
|
| 103 |
+
self.wk, self.wv = [
|
| 104 |
+
torch.nn.Linear(
|
| 105 |
+
dim,
|
| 106 |
+
n_kv_heads * head_dim,
|
| 107 |
+
bias=fc_bias,
|
| 108 |
+
)
|
| 109 |
+
for _ in range(2)
|
| 110 |
+
]
|
| 111 |
+
self.wo = torch.nn.Linear(
|
| 112 |
+
n_heads * head_dim,
|
| 113 |
+
dim,
|
| 114 |
+
bias=fc_bias,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
if self.use_qk_norm is True:
|
| 118 |
+
self.q_norm = RMSNorm(head_dim, eps=norm_eps)
|
| 119 |
+
self.k_norm = RMSNorm(head_dim, eps=norm_eps)
|
| 120 |
+
|
| 121 |
+
def reshape_heads(self, x: torch.Tensor, heads: int) -> torch.Tensor:
|
| 122 |
+
B, T, C = x.shape
|
| 123 |
+
# B x T x C -> B x T x C/H x H
|
| 124 |
+
x = x.reshape(B, T, C // heads, heads)
|
| 125 |
+
# B x T x C/H x H -> B x H x T x C/H
|
| 126 |
+
return x.permute(0, 3, 1, 2)
|
| 127 |
+
|
| 128 |
+
def forward(
|
| 129 |
+
self,
|
| 130 |
+
x: torch.Tensor,
|
| 131 |
+
cross_x: Optional[torch.Tensor] = None,
|
| 132 |
+
key_padding_mask: Optional[torch.Tensor] = None,
|
| 133 |
+
rope: Optional[RotaryEmbedding] = None,
|
| 134 |
+
):
|
| 135 |
+
# x: B, T, E
|
| 136 |
+
xq = self.wq(x)
|
| 137 |
+
if cross_x is not None:
|
| 138 |
+
xk, xv = self.wk(cross_x), self.wv(cross_x)
|
| 139 |
+
else:
|
| 140 |
+
xk, xv = self.wk(x), self.wv(x)
|
| 141 |
+
|
| 142 |
+
xk = self.reshape_heads(xk, self.n_kv_heads)
|
| 143 |
+
xv = self.reshape_heads(xv, self.n_kv_heads)
|
| 144 |
+
xq = self.reshape_heads(xq, self.n_heads)
|
| 145 |
+
if self.use_qk_norm:
|
| 146 |
+
xq = self.q_norm(xq)
|
| 147 |
+
xk = self.k_norm(xk)
|
| 148 |
+
|
| 149 |
+
if rope is not None:
|
| 150 |
+
xq = rope(xq, bhle=True)
|
| 151 |
+
xk = rope(xk, bhle=True)
|
| 152 |
+
|
| 153 |
+
attn_mask = None
|
| 154 |
+
|
| 155 |
+
if key_padding_mask is not None:
|
| 156 |
+
attn_mask = key_padding_mask[:, None, None, :]
|
| 157 |
+
|
| 158 |
+
output = F.scaled_dot_product_attention(xq, xk, xv, attn_mask=attn_mask)
|
| 159 |
+
|
| 160 |
+
output = rearrange(output, "b h n d -> b n (h d)")
|
| 161 |
+
return self.wo(output)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class FeedForward(torch.nn.Module):
|
| 165 |
+
def __init__(
|
| 166 |
+
self,
|
| 167 |
+
dim: int,
|
| 168 |
+
hidden_dim: int,
|
| 169 |
+
ffn_dim_multiplier: float,
|
| 170 |
+
multiple_of: int,
|
| 171 |
+
dropout: float,
|
| 172 |
+
non_linearity: str = "swiglu",
|
| 173 |
+
fc_bias: bool = False,
|
| 174 |
+
):
|
| 175 |
+
super().__init__()
|
| 176 |
+
self.dropout = dropout
|
| 177 |
+
self.swiglu = non_linearity == "swiglu"
|
| 178 |
+
# swiglu hidden dim factor multiplier (same #params as relu / gelu)
|
| 179 |
+
if self.swiglu:
|
| 180 |
+
hidden_dim = int(2 * hidden_dim / 3)
|
| 181 |
+
|
| 182 |
+
# custom dim factor multiplier
|
| 183 |
+
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
|
| 184 |
+
# round hidden dimension to `multiple_of`
|
| 185 |
+
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
| 186 |
+
# layers
|
| 187 |
+
self.w1 = torch.nn.Linear(dim, hidden_dim, bias=fc_bias)
|
| 188 |
+
self.w2 = torch.nn.Linear(hidden_dim, dim, bias=fc_bias)
|
| 189 |
+
if self.swiglu:
|
| 190 |
+
self.w3 = torch.nn.Linear(dim, hidden_dim, bias=fc_bias)
|
| 191 |
+
|
| 192 |
+
# non-linearity
|
| 193 |
+
self.non_linearity = get_nonlinearity(non_linearity)
|
| 194 |
+
|
| 195 |
+
def forward(
|
| 196 |
+
self,
|
| 197 |
+
x,
|
| 198 |
+
):
|
| 199 |
+
hidden1 = self.w1(x)
|
| 200 |
+
if self.swiglu:
|
| 201 |
+
hidden3 = self.w3(x)
|
| 202 |
+
hidden = F.silu(hidden1) * hidden3
|
| 203 |
+
else:
|
| 204 |
+
hidden = self.non_linearity(hidden1)
|
| 205 |
+
hidden = F.dropout(hidden, p=self.dropout, training=self.training)
|
| 206 |
+
return self.w2(hidden)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class TimestepEmbedder(torch.nn.Module):
|
| 210 |
+
def __init__(
|
| 211 |
+
self,
|
| 212 |
+
dim: int,
|
| 213 |
+
frequency_embedding_dim: int,
|
| 214 |
+
non_linearity: str,
|
| 215 |
+
dropout: float,
|
| 216 |
+
fc_bias: bool,
|
| 217 |
+
max_period: int = 10000,
|
| 218 |
+
):
|
| 219 |
+
super().__init__()
|
| 220 |
+
self.frequency_embedding_size = frequency_embedding_dim
|
| 221 |
+
self.projection = ProjectionLayer(
|
| 222 |
+
in_dim=frequency_embedding_dim,
|
| 223 |
+
out_dim=dim,
|
| 224 |
+
non_linearity=non_linearity,
|
| 225 |
+
dropout=dropout,
|
| 226 |
+
fc_bias=fc_bias,
|
| 227 |
+
)
|
| 228 |
+
half = frequency_embedding_dim // 2
|
| 229 |
+
freqs = torch.exp(
|
| 230 |
+
-math.log(max_period)
|
| 231 |
+
* torch.arange(start=0, end=half, dtype=torch.float32)
|
| 232 |
+
/ half
|
| 233 |
+
)
|
| 234 |
+
self.register_buffer("freqs", freqs, persistent=False)
|
| 235 |
+
|
| 236 |
+
def timestep_embedding(self, t, dim):
|
| 237 |
+
"""
|
| 238 |
+
Create sinusoidal timestep embeddings.
|
| 239 |
+
:param t: a 1-D Tensor of N indices, one per batch element.
|
| 240 |
+
These may be fractional.
|
| 241 |
+
:param dim: the dimension of the output.
|
| 242 |
+
:param max_period: controls the minimum frequency of the embeddings.
|
| 243 |
+
:return: an (N, D) Tensor of positional embeddings.
|
| 244 |
+
"""
|
| 245 |
+
# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
|
| 246 |
+
self.freqs = self.freqs.to(device=t.device)
|
| 247 |
+
args = t[:, None].float() * self.freqs[None]
|
| 248 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
| 249 |
+
if dim % 2:
|
| 250 |
+
embedding = torch.cat(
|
| 251 |
+
[embedding, torch.zeros_like(embedding[:, :1])], dim=-1
|
| 252 |
+
)
|
| 253 |
+
return embedding.to(t)
|
| 254 |
+
|
| 255 |
+
def forward(self, t):
|
| 256 |
+
x = self.timestep_embedding(t, self.frequency_embedding_size)
|
| 257 |
+
return self.projection(x)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
class ContextEmbedder(torch.nn.Module):
|
| 261 |
+
def __init__(
|
| 262 |
+
self,
|
| 263 |
+
in_dim: int,
|
| 264 |
+
out_dim: int,
|
| 265 |
+
non_linearity: str,
|
| 266 |
+
dropout: float,
|
| 267 |
+
fc_bias: bool,
|
| 268 |
+
norm_eps: float = 1e-5,
|
| 269 |
+
context_norm: bool = False,
|
| 270 |
+
):
|
| 271 |
+
super().__init__()
|
| 272 |
+
self.context_norm = context_norm
|
| 273 |
+
if context_norm:
|
| 274 |
+
self.norm = RMSNorm(in_dim, norm_eps)
|
| 275 |
+
|
| 276 |
+
self.projection = ProjectionLayer(
|
| 277 |
+
in_dim=in_dim,
|
| 278 |
+
out_dim=out_dim,
|
| 279 |
+
non_linearity=non_linearity,
|
| 280 |
+
dropout=dropout,
|
| 281 |
+
fc_bias=fc_bias,
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
def forward(self, x):
|
| 285 |
+
if self.context_norm:
|
| 286 |
+
x = self.norm(x)
|
| 287 |
+
h = self.projection(x)
|
| 288 |
+
return h
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
class DiTBlock(torch.nn.Module):
|
| 292 |
+
def __init__(
|
| 293 |
+
self,
|
| 294 |
+
dim: int,
|
| 295 |
+
n_heads: int,
|
| 296 |
+
n_kv_heads: Optional[int] = None,
|
| 297 |
+
dropout: float = 0.0,
|
| 298 |
+
norm_eps: float = 1e-5,
|
| 299 |
+
qk_norm: bool = False,
|
| 300 |
+
fc_bias: bool = False,
|
| 301 |
+
ffn_exp: int = 1,
|
| 302 |
+
ffn_dim_multiplier: int = 4,
|
| 303 |
+
multiple_of: int = 64,
|
| 304 |
+
non_linearity: str = "silu",
|
| 305 |
+
no_cross_attention: bool = False,
|
| 306 |
+
):
|
| 307 |
+
super().__init__()
|
| 308 |
+
assert dim % n_heads == 0
|
| 309 |
+
self.n_heads = n_heads
|
| 310 |
+
self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
|
| 311 |
+
self.dim = dim
|
| 312 |
+
self.dropout = dropout
|
| 313 |
+
self.head_dim = dim // n_heads
|
| 314 |
+
|
| 315 |
+
assert self.n_heads % self.n_kv_heads == 0
|
| 316 |
+
|
| 317 |
+
self.attention = Attention(
|
| 318 |
+
dim=dim,
|
| 319 |
+
head_dim=self.head_dim,
|
| 320 |
+
n_heads=self.n_heads,
|
| 321 |
+
n_kv_heads=self.n_kv_heads,
|
| 322 |
+
norm_eps=norm_eps,
|
| 323 |
+
use_qk_norm=qk_norm,
|
| 324 |
+
fc_bias=fc_bias,
|
| 325 |
+
)
|
| 326 |
+
self.feed_forward = FeedForward(
|
| 327 |
+
dim=dim,
|
| 328 |
+
hidden_dim=int(ffn_exp * dim),
|
| 329 |
+
ffn_dim_multiplier=ffn_dim_multiplier,
|
| 330 |
+
multiple_of=multiple_of,
|
| 331 |
+
dropout=dropout,
|
| 332 |
+
non_linearity=non_linearity,
|
| 333 |
+
fc_bias=fc_bias,
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
self.attention_norm, self.ffn_norm = [RMSNorm(dim, norm_eps) for _ in range(2)]
|
| 337 |
+
|
| 338 |
+
self.cross_attention = None
|
| 339 |
+
if not no_cross_attention:
|
| 340 |
+
self.cross_attention = Attention(
|
| 341 |
+
dim=dim,
|
| 342 |
+
head_dim=self.head_dim,
|
| 343 |
+
n_heads=self.n_heads,
|
| 344 |
+
n_kv_heads=self.n_heads,
|
            norm_eps=norm_eps,
            use_qk_norm=qk_norm,
            fc_bias=fc_bias,
        )

        self.scale_shift_table = nn.Parameter(
            torch.randn(6, self.dim) / self.dim**0.5,
        )

    def forward(
        self,
        x: torch.Tensor,
        cross_x: Optional[torch.Tensor],
        t: torch.Tensor,
        padding_mask: Optional[torch.Tensor],
        memory_padding_mask: Optional[torch.Tensor],
        rope: Optional[RotaryEmbedding] = None,
    ):
        biases = self.scale_shift_table[None] + t.reshape(x.size(0), 6, -1)
        (
            shift_msa,
            scale_msa,
            gate_msa,
            shift_mlp,
            scale_mlp,
            gate_mlp,
        ) = biases.chunk(6, dim=1)

        assert self.attention is not None and self.attention_norm is not None
        h_attn = self.attention(
            modulate(self.attention_norm(x), shift_msa, scale_msa),
            key_padding_mask=padding_mask,
            rope=rope,
        )

        h = x + gate(h_attn, gate_msa)

        if self.cross_attention is not None:
            h_cross = self.cross_attention(
                x=h,
                cross_x=cross_x,
                key_padding_mask=memory_padding_mask,
            )
            h = h + h_cross  # residual
        h_ff = self.feed_forward(modulate(self.ffn_norm(h), shift_mlp, scale_mlp))
        out = h + gate(h_ff, gate_mlp)
        return out


class DiT(torch.nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.dropout = config.dropout
        if config.in_channels is not None:
            self.data_proj = torch.nn.Linear(config.in_channels, config.dim)

        # embeddings
        self.rope_embeddings = None
        # rotary embeddings
        if config.use_rope:
            self.rope_embeddings = RotaryEmbedding(
                theta=max(10000, 2 * config.max_positions),
                head_dim=config.dim // config.n_heads,
                max_seqlen=config.max_positions,
            )
            self.rope_embeddings.reset_parameters()

        # transformer blocks
        self.layers = nn.ModuleList()
        for _ in range(config.n_layers):
            self.layers.append(
                DiTBlock(
                    dim=config.dim,
                    n_heads=config.n_heads,
                    dropout=config.dropout,
                    norm_eps=config.norm_eps,
                    qk_norm=config.qk_norm,
                    fc_bias=config.fc_bias,
                    ffn_exp=config.ffn_exp,
                    ffn_dim_multiplier=config.ffn_dim_multiplier,
                    multiple_of=config.multiple_of,
                    non_linearity=config.non_linearity,
                )
            )

        self.norm = RMSNorm(config.dim, config.norm_eps)

        # output layer
        self.output = torch.nn.Linear(
            config.dim, config.out_channels, bias=config.fc_bias
        )

        self.x_embedder = Patcher(
            in_channels=config.dim,
            out_channels=config.dim,
            patch_size=1,
        )

        self.y_embedder = ContextEmbedder(
            in_dim=config.context_dim,
            out_dim=config.dim,
            non_linearity=config.context_non_linearity,
            dropout=config.context_embedder_dropout,
            fc_bias=config.fc_bias,
            norm_eps=config.norm_eps,
            context_norm=config.context_norm,
        )

        self.t_embedder = TimestepEmbedder(
            config.dim,
            config.frequency_embedding_dim,
            non_linearity=config.timestep_non_linearity,
            dropout=config.dropout,
            fc_bias=config.fc_bias,
            max_period=10000,
        )

        self.t_block_non_linearity = get_nonlinearity(config.t_block_non_linearity)
        self.t_block = torch.nn.Linear(
            config.dim,
            config.dim * 6,
            bias=config.t_block_bias,
        )

        self.final_layer_scale_shift_table = nn.Parameter(
            torch.randn(2, config.dim) / config.dim**0.5,
        )

    def forward(
        self,
        x: torch.Tensor,
        time: torch.Tensor,
        *,
        padding_mask: Optional[torch.Tensor] = None,
        memory: Optional[torch.Tensor] = None,
        memory_padding_mask: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        x = rearrange(x, "b l c-> b c l")
        h = self.x_embedder(x)
        h = rearrange(h, "b c l -> b l c")
        original_N = h.shape[1]
        N = h.shape[1]

        h = F.dropout(h, p=self.dropout, training=self.training)

        t = self.t_embedder(time)  # B -> B D

        t0 = self.t_block_non_linearity(t)
        t0 = self.t_block(t0)  # B D -> B 6D

        y = self.y_embedder(memory)

        for layer in self.layers:
            h = layer(
                x=h,
                cross_x=y,
                t=t0,
                padding_mask=padding_mask,
                memory_padding_mask=memory_padding_mask,
                rope=self.rope_embeddings,
            )

        shift, scale = (self.final_layer_scale_shift_table[None] + t[:, None]).chunk(
            2, dim=1
        )

        # output layer
        if self.norm is not None:
            h = self.norm(h)

        h = modulate(h, shift, scale)

        h = F.dropout(h, p=self.dropout, training=self.training)

        output = self.output(h)

        N = output.shape[1]
        if original_N != N:
            output = output[:, -original_N:]
        return output
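
The adaLN-style conditioning in `DiTBlock.forward` is easy to misread, so here is a minimal standalone sketch (random tensors only) of how the learned `scale_shift_table` is combined with the `(B, 6D)` timestep projection and split into six modulation terms. The `modulate`/`gate` helpers are defined earlier in `transformer.py` and are not part of this hunk, so the `x * (1 + scale) + shift` line below is an assumed convention rather than a quote of the real helpers.

```python
import torch

B, L, D = 2, 10, 8  # toy batch, sequence length, model dim

scale_shift_table = torch.randn(6, D) / D**0.5  # as in DiTBlock.__init__
t0 = torch.randn(B, 6 * D)                      # t_block output: (B, 6D)

# Mirror DiTBlock.forward: broadcast the learned table against the per-sample
# conditioning and split it into six (B, 1, D) modulation terms.
biases = scale_shift_table[None] + t0.reshape(B, 6, -1)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = biases.chunk(6, dim=1)

x = torch.randn(B, L, D)
# Assumed adaLN convention for modulate(); the actual helper lives earlier in the file.
h = x * (1 + scale_msa) + shift_msa
print(h.shape)  # torch.Size([2, 10, 8])
```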
sam_audio/model/vision_encoder.py
ADDED
@@ -0,0 +1,113 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

from abc import ABCMeta, abstractmethod

import torch
import torchvision
from core.vision_encoder import pe
from torch.nn.utils.rnn import pad_sequence

from sam_audio.model.config import (
    PerceptionEncoderConfig,
    VisionEncoderConfig,
)


class RescaleTransform(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size, interpolation):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        self.interpolation = interpolation

    def __call__(self, sample):
        # sample: [T, C, H, W]
        sample = torch.nn.functional.interpolate(
            sample.float(), size=self.output_size, mode=self.interpolation.value
        )
        return sample


class VisionEncoder(torch.nn.Module, metaclass=ABCMeta):
    def __init__(self, cfg: VisionEncoderConfig):
        super().__init__()
        self.batch_size = cfg.batch_size
        self.dim = cfg.dim
        self.transform = self.get_transform()

    @torch.no_grad()
    def forward(self, videos: list[torch.Tensor]) -> torch.Tensor:
        """
        Encodes a list of input videos. Each element of the list is a video represented
        as a tensor [T, C, H, W]
        Args:
            videos (list[torch.Tensor]): List of input image tensors to be processed.

        Returns:
            torch.Tensor: Encoded feature representations of the input tensors.
                The output is padded along the time dimension for variable length videos
        """
        result = []
        for video in videos:
            video = self.transform(video)
            if self.batch_size > 0 and video.size(0) > self.batch_size:
                res = []
                for i in range(0, video.size(0), self.batch_size):
                    res.append(self.encode(video[i : i + self.batch_size]))
                result.append(torch.cat(res, dim=0))
            else:
                result.append(self.encode(video))
        return pad_sequence(result, batch_first=True, padding_value=0.0)

    @abstractmethod
    def encode(self, x: torch.Tensor) -> torch.Tensor:
        pass

    @abstractmethod
    def get_transform(self):
        pass


class PerceptionEncoder(VisionEncoder):
    def __init__(self, cfg: PerceptionEncoderConfig):
        self.normalize_feature = cfg.normalize_feature
        self.interpolation_mode = cfg.interpolation_mode
        self.image_size = cfg.image_size
        super().__init__(cfg)
        self.model = pe.CLIP.from_config(cfg.name)

    def encode(self, x):
        image_features = self.model.encode_image(x, normalize=self.normalize_feature)
        return image_features

    def get_transform(self):
        T = torchvision.transforms
        try:
            interp = getattr(T.InterpolationMode, self.interpolation_mode.upper())
        except AttributeError as err:
            raise ValueError(
                f"Unsupported interpolation_mode: {self.interpolation_mode}"
            ) from err
        crop = [
            T.Resize(
                (self.image_size, self.image_size),
                interpolation=interp,
            )
        ]

        return T.Compose(
            crop
            + [
                T.Lambda(lambda x: x.float() / 255.0),
                T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
            ]
        )
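
For readability, a self-contained sketch of the chunk-encode-and-pad pattern used in `VisionEncoder.forward`; the stub `encode` below stands in for `PerceptionEncoder.encode`, and the feature dimension is illustrative only.

```python
import torch
from torch.nn.utils.rnn import pad_sequence


def encode(frames: torch.Tensor) -> torch.Tensor:
    # Stand-in for PerceptionEncoder.encode: one 4-dim feature vector per frame.
    return frames.flatten(1).mean(dim=1, keepdim=True).expand(-1, 4)


batch_size = 8
videos = [torch.rand(13, 3, 16, 16), torch.rand(5, 3, 16, 16)]  # variable-length [T, C, H, W]

result = []
for video in videos:
    if batch_size > 0 and video.size(0) > batch_size:
        # Encode long videos in chunks of `batch_size` frames, then re-concatenate.
        chunks = [encode(video[i : i + batch_size]) for i in range(0, video.size(0), batch_size)]
        result.append(torch.cat(chunks, dim=0))
    else:
        result.append(encode(video))

features = pad_sequence(result, batch_first=True, padding_value=0.0)
print(features.shape)  # torch.Size([2, 13, 4]) -- padded to the longest video
```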
sam_audio/processor.py
ADDED
@@ -0,0 +1,382 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

import json
import logging
import math
import os
from typing import Callable, List, Optional, Tuple

import torch
import torchaudio
from huggingface_hub import hf_hub_download
from torch.nn.utils.rnn import pad_sequence
from torchcodec.decoders import AudioDecoder, VideoDecoder
from transformers import AutoTokenizer, BatchFeature

from sam_audio.model.config import SAMAudioConfig, SAMAudioJudgeConfig

logger = logging.getLogger(__name__)

Anchor = Tuple[str, float, float]


def batch_audio(
    audios: list[str | torch.Tensor], audio_sampling_rate: int = 48_000
) -> Tuple[torch.Tensor, torch.Tensor]:
    wavs = []
    for audio in audios:
        if isinstance(audio, str):
            wav, sr = torchaudio.load(audio)
            if sr != audio_sampling_rate:
                wav = torchaudio.functional.resample(wav, sr, audio_sampling_rate)
        else:
            wav = audio
        wavs.append(wav.mean(0))
    sizes = torch.tensor([wav.size(-1) for wav in wavs])
    return pad_sequence(wavs, batch_first=True).unsqueeze(1), sizes


class Batch:
    def __init__(
        self,
        audios: torch.Tensor,
        sizes: torch.Tensor,
        wav_sizes: torch.Tensor,
        descriptions: list[str],
        hop_length: int,
        audio_sampling_rate: int,
        anchors: Optional[list[list[Anchor]]] = None,
        audio_pad_mask: Optional[torch.Tensor] = None,
        masked_video: Optional[torch.Tensor] = None,
    ):
        self.audios = audios
        self.sizes = sizes
        self.wav_sizes = wav_sizes
        self.descriptions = descriptions
        self.audio_pad_mask = audio_pad_mask
        self.masked_video = masked_video
        self.hop_length = hop_length
        self.audio_sampling_rate = audio_sampling_rate
        self.process_anchors(anchors)
        assert self.audios.size(0) == len(self.descriptions)

    def _wav_to_feature_idx(self, wav_idx: int):
        return math.ceil(wav_idx / self.hop_length)

    def to(self, device: torch.device):
        self.audios = self.audios.to(device)
        self.anchor_ids = self.anchor_ids.to(device)
        self.anchor_alignment = self.anchor_alignment.to(device)
        self.sizes = self.sizes.to(device)
        self.wav_sizes = self.wav_sizes.to(device)
        if self.audio_pad_mask is not None:
            self.audio_pad_mask = self.audio_pad_mask.to(device)
        if self.masked_video is not None:
            self.masked_video = [v.to(device) for v in self.masked_video]
        return self

    def process_anchors(self, anchors: Optional[list[list[Anchor]]]):
        batch_size = len(self.audios)
        anchor_dict = {"<null>": 0, "+": 1, "-": 2, "<pad>": 3}
        if anchors is None:
            anchor_ids = torch.full(
                (batch_size, 2), anchor_dict["<null>"], dtype=torch.long
            )
            anchor_ids[:, 1] = anchor_dict["<pad>"]
            anchor_alignment = torch.full(
                (
                    batch_size,
                    self.audio_pad_mask.size(-1),
                ),
                0,
                dtype=torch.long,
            )
            anchor_alignment[~self.audio_pad_mask] = 1  # point to pad token
        else:
            anchor_alignment = torch.full(
                (
                    batch_size,
                    self.audio_pad_mask.size(-1),
                ),
                0,
                dtype=torch.long,
            )
            anchor_alignment[~self.audio_pad_mask] = 1  # point to pad token
            ids = []

            for i, anchor_list in enumerate(anchors):
                current = [anchor_dict["<null>"], anchor_dict["<pad>"]]
                for token, start_time, end_time in anchor_list:
                    start_idx = self._wav_to_feature_idx(
                        start_time * self.audio_sampling_rate
                    )
                    end_idx = self._wav_to_feature_idx(
                        end_time * self.audio_sampling_rate
                    )
                    anchor_alignment[i, start_idx:end_idx] = len(current)
                    current.append(anchor_dict[token])
                ids.append(torch.tensor(current))
            anchor_ids = pad_sequence(
                ids, batch_first=True, padding_value=anchor_dict["<pad>"]
            )
        self.anchor_ids = anchor_ids
        self.anchor_alignment = anchor_alignment
        self.anchors = anchors


def mask_from_sizes(sizes: torch.Tensor) -> torch.Tensor:
    return torch.arange(sizes.max()).expand(len(sizes), -1) < sizes.unsqueeze(1)


def load_video(
    sizes: torch.Tensor,
    videos: List[str],
    feature_idx_to_wav_idx: Callable[[torch.Tensor], torch.Tensor],
    audio_sampling_rate: int,
) -> list[torch.Tensor]:
    all_frames = []
    for size, video in zip(sizes, videos, strict=False):
        audio_timestamps = (
            feature_idx_to_wav_idx(torch.arange(size)) / audio_sampling_rate
        )
        if isinstance(video, str):
            decoder = VideoDecoder(video, dimension_order="NCHW")
            data = decoder.get_frames_in_range(0, len(decoder))
            diffs = (audio_timestamps[None] - data.pts_seconds[:, None]).abs()
            frame_idxs = diffs.argmin(dim=0)
            frames = data.data[frame_idxs]
        else:
            assert video.size(1) == 3, (
                f"Expected video tensor to be in NCHW format, but found {video.size(1)} channels"
            )
            idx = torch.linspace(0, video.size(0) - 1, int(size)).round().long()
            frames = video[idx]
        all_frames.append(frames)
    return all_frames


class Processor:
    config_cls: Callable

    def __init__(self, audio_hop_length: int, audio_sampling_rate: int):
        self.audio_hop_length = audio_hop_length
        self.audio_sampling_rate = audio_sampling_rate

    @classmethod
    def _get_config(cls, model_name_or_path: str):
        if os.path.exists(model_name_or_path):
            config_path = os.path.join(model_name_or_path, "config.json")
        else:
            config_path = hf_hub_download(
                repo_id=model_name_or_path,
                filename="config.json",
                revision=cls.revision,
            )
        with open(config_path) as fin:
            config = cls.config_cls(**json.load(fin))
        return config

    @classmethod
    def from_pretrained(cls, model_name_or_path: str) -> "Processor":
        config = cls._get_config(model_name_or_path)
        return cls(
            audio_hop_length=config.audio_codec.hop_length,
            audio_sampling_rate=config.audio_codec.sample_rate,
        )

    def feature_to_wav_idx(self, feature_idx):
        return feature_idx * self.audio_hop_length

    def wav_to_feature_idx(self, wav_idx):
        if torch.is_tensor(wav_idx):
            ceil = torch.ceil
        else:
            ceil = math.ceil
        return ceil(wav_idx / self.audio_hop_length)

    def mask_videos(
        self,
        videos: List[str | torch.Tensor],
        masks: List[str | torch.Tensor],
    ) -> list[torch.Tensor]:
        video = [VideoDecoder(v)[:] if isinstance(v, str) else v for v in videos]
        video_mask = [VideoDecoder(v)[:] if isinstance(v, str) else v for v in masks]
        return [v * m.eq(0) for v, m in zip(video, video_mask, strict=False)]


class SAMAudioProcessor(Processor):
    config_cls = SAMAudioConfig
    revision = None

    def __call__(
        self,
        descriptions: list[str],
        audios: list[str | torch.Tensor],
        anchors: Optional[list[list[Anchor]]] = None,
        masked_videos: Optional[list[str | torch.Tensor]] = None,
    ):
        """
        Processes input data for the model.

        Args:
            descriptions (list[str]): List of text descriptions corresponding to each audio sample.
            audios (list[str | torch.Tensor]): List of audio file paths or tensors.
                If a tensor:
                    - should have shape (channels, time) where channels=1 for mono and 2 for stereo.
                    - should be resampled to 48,000 Hz
            anchors (Optional[list[list[Anchor]]], optional): List of anchors for each sample,
                where each anchor is a tuple (token, start_time, end_time).
            masked_videos (Optional[list[str | torch.Tensor]], optional): List of masked video file paths or tensors.
                If a tensor, should have shape (N, C, H, W)

        Returns:
            Batch: A Batch object containing processed audio, sizes, descriptions, anchor ids, anchor alignment, audio pad mask, and optionally masked video.
        """

        assert len(descriptions) == len(audios)
        assert anchors is None or len(descriptions) == len(anchors)
        assert masked_videos is None or len(descriptions) == len(masked_videos)

        audios, wav_sizes = batch_audio(audios, self.audio_sampling_rate)

        sizes = self.wav_to_feature_idx(wav_sizes)
        audio_pad_mask = mask_from_sizes(sizes)
        masked_video = None
        if masked_videos is not None:
            masked_video = load_video(
                sizes, masked_videos, self.feature_to_wav_idx, self.audio_sampling_rate
            )

        return Batch(
            audios=audios,
            sizes=sizes,
            descriptions=descriptions,
            audio_pad_mask=audio_pad_mask,
            anchors=anchors,
            masked_video=masked_video,
            hop_length=self.audio_hop_length,
            audio_sampling_rate=self.audio_sampling_rate,
            wav_sizes=wav_sizes,
        )


class SAMAudioJudgeProcessor(Processor):
    config_cls = SAMAudioJudgeConfig
    revision = "sam_audio"

    def __init__(
        self,
        audio_hop_length: int,
        audio_sampling_rate: int,
        tokenizer: AutoTokenizer,
    ):
        super().__init__(audio_hop_length, audio_sampling_rate)
        self.tokenizer = tokenizer

    @classmethod
    def from_pretrained(cls, model_name_or_path: str) -> "SAMAudioJudgeProcessor":
        config = cls._get_config(model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        return cls(
            audio_hop_length=config.audio_codec.hop_length,
            audio_sampling_rate=config.audio_codec.sample_rate,
            tokenizer=tokenizer,
        )

    def _reflect_pad(self, wav):
        if wav.ndim == 1:
            wav = wav.unsqueeze(0)
        if wav.size(-1) % self.audio_hop_length == 0:
            return wav
        p1d = (0, self.audio_hop_length - (wav.size(-1) % self.audio_hop_length))
        return torch.nn.functional.pad(wav, p1d, mode="reflect")

    def _load_audio(self, path: str):
        ad = AudioDecoder(path, sample_rate=self.audio_sampling_rate, num_channels=1)
        return ad.get_all_samples().data

    def _process_audio(
        self,
        raw_audio,
        sampling_rate: Optional[int] = None,
    ):
        from_file = False
        if isinstance(raw_audio, str):
            raw_audio = [raw_audio]

        if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
            loaded = []
            for audio_file in raw_audio:
                loaded.append(self._load_audio(audio_file))
            raw_audio = loaded
            from_file = True

        if sampling_rate is not None:
            if sampling_rate != self.audio_sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.audio_sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.audio_sampling_rate} and not {sampling_rate}."
                )
        elif not from_file:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if isinstance(raw_audio, list):
            raw_audio = [self._reflect_pad(x).T for x in raw_audio]
        else:
            raw_audio = self._reflect_pad(raw_audio).T

        # verify inputs are valid
        for example in raw_audio:
            if example.ndim > 2:
                raise ValueError(
                    f"Expected input shape (channels, num_samples), but got shape ({example.shape})"
                )

        lengths = torch.tensor([x.size(0) for x in raw_audio])
        input_values = pad_sequence(raw_audio, batch_first=True).transpose(1, 2)
        padding_mask = torch.arange(lengths.max())[None] < lengths[:, None]

        return BatchFeature(
            {"input_values": input_values, "padding_mask": padding_mask}
        )

    def __call__(
        self,
        text: Optional[str] = None,
        input_audio: Optional[
            str | list[str] | torch.Tensor | list[torch.Tensor]
        ] = None,
        separated_audio: Optional[
            str | list[str] | torch.Tensor | list[torch.Tensor]
        ] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ):
        batch = BatchFeature()
        if text is not None:
            batch.update(
                self.tokenizer(
                    text,
                    return_tensors="pt",
                    padding="longest",
                    max_length=512,
                    truncation=True,
                )
            )

        if input_audio is not None:
            batch.update(self._process_audio(input_audio, sampling_rate))

        if separated_audio is not None:
            batch["separated_values"] = self._process_audio(
                separated_audio, sampling_rate
            )["input_values"]

        return batch


__all__ = ["SAMAudioProcessor", "SAMAudioJudgeProcessor", "Batch"]
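
A hedged usage sketch of `SAMAudioProcessor`: the repository id below is a placeholder (substitute the real checkpoint), and the waveform is random noise, but the call signature, anchor tuples, and `Batch` attributes follow the code above.

```python
import torch

from sam_audio.processor import SAMAudioProcessor

# Placeholder id -- replace with the actual SAM-Audio checkpoint on the Hub.
processor = SAMAudioProcessor.from_pretrained("<org>/<sam-audio-checkpoint>")

# One mono 48 kHz waveform, a text prompt, and a positive anchor covering 1.0s-2.5s.
wav = torch.randn(1, 48_000 * 4)
batch = processor(
    descriptions=["a dog barking"],
    audios=[wav],
    anchors=[[("+", 1.0, 2.5)]],
)
batch = batch.to(torch.device("cpu"))
print(batch.audios.shape, batch.sizes, batch.anchor_ids)
```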
sam_audio/ranking/__init__.py
ADDED
@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

from sam_audio.model.config import (
    ClapRankerConfig,
    EnsembleRankerConfig,
    ImageBindRankerConfig,
    JudgeRankerConfig,
)
from sam_audio.ranking.clap import ClapRanker
from sam_audio.ranking.imagebind import ImageBindRanker
from sam_audio.ranking.judge import JudgeRanker
from sam_audio.ranking.ranker import EnsembleRanker


def create_ranker(config):
    if isinstance(config, ImageBindRankerConfig):
        return ImageBindRanker(config)
    elif isinstance(config, ClapRankerConfig):
        return ClapRanker(config)
    elif isinstance(config, JudgeRankerConfig):
        return JudgeRanker(config)
    elif isinstance(config, EnsembleRankerConfig):
        ranker_cfgs, weights = zip(*config.rankers.values(), strict=False)
        return EnsembleRanker(
            rankers=[create_ranker(cfg) for cfg in ranker_cfgs],
            weights=weights,
        )
    else:
        assert config is None
        return None
sam_audio/ranking/clap.py
ADDED
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

import torch
import torchaudio
from huggingface_hub import hf_hub_download

from sam_audio.model.config import ClapRankerConfig
from sam_audio.ranking.ranker import Ranker


def get_model(device="cpu"):
    import laion_clap

    model = laion_clap.CLAP_Module(enable_fusion=False, amodel="HTSAT-tiny").to(device)
    checkpoint_file = hf_hub_download(
        repo_id="lukewys/laion_clap", filename="630k-best.pt"
    )
    state_dict = torch.load(checkpoint_file, map_location=device, weights_only=False)[
        "state_dict"
    ]
    if next(iter(state_dict.items()))[0].startswith("module"):
        state_dict = {k[7:]: v for k, v in state_dict.items()}

    if "text_branch.embeddings.position_ids" in state_dict:
        del state_dict["text_branch.embeddings.position_ids"]

    model.model.load_state_dict(state_dict)
    return model.eval()


class ClapRanker(Ranker):
    def __init__(self, config: ClapRankerConfig):
        from laion_clap.training import data

        self.laion_data_module = data
        super().__init__()
        self.config = config
        self.model = get_model()

    def _prepare_audio(self, audio, sample_rate):
        audio_features = []
        for candidates in audio:
            if sample_rate != 48_000:
                candidates = torchaudio.functional.resample(
                    candidates, sample_rate, 48000
                )

            quantized = self.laion_data_module.int16_to_float32_torch(
                self.laion_data_module.float32_to_int16_torch(candidates)
            ).float()
            for sample in quantized:
                temp_dict = {}
                temp_dict = self.laion_data_module.get_audio_features(
                    temp_dict,
                    sample,
                    480000,
                    data_truncating=(
                        "fusion" if self.model.enable_fusion else "rand_trunc"
                    ),
                    data_filling="repeatpad",
                    audio_cfg=self.model.model_cfg["audio_cfg"],
                    require_grad=False,
                )
                audio_features.append(temp_dict)
        return audio_features

    @torch.inference_mode()
    def forward(
        self,
        extracted_audio: list[torch.Tensor],
        descriptions: list[str],
        sample_rate: int = 48_000,
        **kwargs,
    ):
        audio_embed = self.model.model.get_audio_embedding(
            self._prepare_audio(extracted_audio, sample_rate)
        )
        text_embed = self.model.get_text_embedding(descriptions, use_tensor=True)
        bsz = len(extracted_audio)
        candidates = len(audio_embed) // bsz
        audio_embed = audio_embed.reshape(bsz, candidates, -1)
        text_embed = text_embed.reshape(bsz, -1, 1)
        scores = audio_embed @ text_embed
        return scores.squeeze(-1)
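
The final reshape in `ClapRanker.forward` is a batched dot product between each candidate's audio embedding and its prompt's text embedding; a short sketch with random stand-in embeddings shows the shape bookkeeping:

```python
import torch

bsz, candidates, dim = 2, 4, 512

# Flattened per-candidate audio embeddings and one text embedding per prompt
# (random stand-ins for the CLAP outputs above).
audio_embed = torch.randn(bsz * candidates, dim)
text_embed = torch.randn(bsz, dim)

# Same reshape-and-matmul as ClapRanker.forward: (B, C, D) @ (B, D, 1) -> (B, C, 1)
scores = audio_embed.reshape(bsz, candidates, -1) @ text_embed.reshape(bsz, -1, 1)
print(scores.squeeze(-1).shape)  # torch.Size([2, 4]) -- one score per candidate
```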
sam_audio/ranking/imagebind.py
ADDED
@@ -0,0 +1,197 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

import math
from typing import List, Union

import torch
import torchaudio

from sam_audio.model.config import ImageBindRankerConfig
from sam_audio.ranking.ranker import Ranker

try:
    from imagebind.data import (
        ConstantClipsPerVideoSampler,
        NormalizeVideo,
        SpatialCrop,
        get_clip_timepoints,
        load_and_transform_video_data,
        pv_transforms,
        transforms,
        waveform2melspec,
    )
    from imagebind.models.imagebind_model import ModalityType, imagebind_huge

    __imagebind_exists__ = True
except ImportError:
    __imagebind_exists__ = False


def load_and_transform_audio_data(
    audios: List[Union[str, torch.Tensor]],
    input_sample_rate=None,
    num_mel_bins=128,
    target_length=204,
    sample_rate=16000,
    clip_duration=2,
    clips_per_video=3,
    mean=-4.268,
    std=9.138,
    device=None,
):
    if audios is None:
        return None

    audio_outputs = []
    clip_sampler = ConstantClipsPerVideoSampler(
        clip_duration=clip_duration, clips_per_video=clips_per_video
    )

    for audio in audios:
        if isinstance(audio, str):
            waveform, input_sample_rate = torchaudio.load(audio)
        else:
            assert torch.is_tensor(audio)
            assert sample_rate is not None
            # Preprocessing needs to be done in full precision
            waveform = audio.float()
        if waveform.ndim == 1:
            waveform = waveform[None]
        if sample_rate != input_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=input_sample_rate, new_freq=sample_rate
            )
        all_clips_timepoints = get_clip_timepoints(
            clip_sampler, waveform.size(1) / sample_rate
        )
        all_clips = []
        for clip_timepoints in all_clips_timepoints:
            waveform_clip = waveform[
                :,
                int(clip_timepoints[0] * sample_rate) : int(
                    clip_timepoints[1] * sample_rate
                ),
            ]
            waveform_melspec = waveform2melspec(
                waveform_clip, sample_rate, num_mel_bins, target_length
            )
            all_clips.append(waveform_melspec)

        normalize = transforms.Normalize(mean=mean, std=std)
        all_clips = [normalize(ac).to(device) for ac in all_clips]

        all_clips = torch.stack(all_clips, dim=0)
        audio_outputs.append(all_clips)

    return torch.stack(audio_outputs, dim=0)


class VideoTransform:
    def __init__(self, clip_duration=2, clips_per_video=5):
        self.clip_duration = clip_duration
        self.clips_per_video = clips_per_video
        self.clip_sampler = ConstantClipsPerVideoSampler(
            clip_duration=clip_duration, clips_per_video=clips_per_video
        )
        self.video_transform = transforms.Compose(
            [
                pv_transforms.ShortSideScale(224),
                NormalizeVideo(
                    mean=(0.48145466, 0.4578275, 0.40821073),
                    std=(0.26862954, 0.26130258, 0.27577711),
                ),
            ]
        )
        self.spatial_crop = SpatialCrop(224, num_crops=3)

    def load_video_fast(self, videos, durations, **kwargs):
        result = []
        for video, duration in zip(videos, durations, strict=False):
            nframes = video.size(0)
            fps = video.size(0) / duration
            timepoints = get_clip_timepoints(
                self.clip_sampler,
                duration,
            )
            # Instead of loading 5 2s clips and then sub-sampling frames, we figure
            # out the indices of the final clips we want and only decode those.
            all_idxs = []
            for start_time, end_time in timepoints:
                idxs = torch.arange(
                    min(int(math.ceil(fps * start_time)), nframes - 1),
                    min(int(math.ceil(fps * end_time)), nframes),
                )
                ts = (
                    torch.linspace(0, idxs.size(0) - 1, self.clip_duration)
                    .clamp(max=idxs.size(0) - 1)
                    .long()
                )
                all_idxs.append(idxs[ts])
            all_idxs = torch.cat(all_idxs)
            fast_frames = video[all_idxs].transpose(0, 1)
            result.append(fast_frames.chunk(self.clips_per_video, dim=1))
        return result

    def transform_video(self, batch, device=None):
        device = device or torch.device("cpu")
        video_outputs = []
        for all_video in batch:
            all_video = [
                self.video_transform(clip.to(device) / 255.0) for clip in all_video
            ]
            all_video = self.spatial_crop(all_video)
            all_video = torch.stack(all_video, dim=0)
            video_outputs.append(all_video)
        return torch.stack(video_outputs, dim=0)

    def __call__(self, videos, durations, device=None):
        return self.transform_video(
            self.load_video_fast(videos, durations), device=device
        )


class ImageBindRanker(Ranker):
    def __init__(self, cfg: ImageBindRankerConfig):
        super().__init__()
        assert __imagebind_exists__, (
            "Install ImageBind in order to use this ranker: https://github.com/facebookresearch/ImageBind/tree/main"
        )

        self.model = imagebind_huge(pretrained=cfg.checkpoint is None)
        if cfg.checkpoint is not None:
            self.model.load_state_dict(torch.load(cfg.checkpoint, map_location="cpu"))
        self.model = self.model.eval()
        self.video_transform = VideoTransform()

    @torch.inference_mode()
    def forward(
        self,
        extracted_audio: list[torch.Tensor],
        videos: list[torch.Tensor | str],
        sample_rate: int = 48_000,
        **kwargs,
    ):
        audio_data = torch.cat(
            [
                load_and_transform_audio_data(x, input_sample_rate=sample_rate)
                for x in extracted_audio
            ],
            dim=0,
        )
        if isinstance(videos[0], str):
            video_data = load_and_transform_video_data(videos)
        else:
            durations = [x.size(-1) / sample_rate for x in extracted_audio]
            video_data = self.video_transform(videos, durations, audio_data.device)

        inputs = {ModalityType.AUDIO: audio_data, ModalityType.VISION: video_data}
        embs = self.model(inputs)
        audio_embs, video_embs = embs[ModalityType.AUDIO], embs[ModalityType.VISION]
        audio_embs, video_embs = (
            audio_embs / ((audio_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
            video_embs / ((video_embs**2).sum(dim=-1, keepdims=True) ** 0.5),
        )
        bsz = len(extracted_audio)
        candidates = len(audio_embs) // bsz
        scores = audio_embs.view(bsz, candidates, -1) @ video_embs.view(bsz, -1, 1)
        return scores
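
`VideoTransform.load_video_fast` avoids decoding whole clips by precomputing the frame indices each clip window needs. A standalone sketch of that index arithmetic, with hard-coded timepoints standing in for `get_clip_timepoints`:

```python
import math

import torch

nframes, duration, clip_duration = 120, 8.0, 2  # toy video: 120 frames over 8 s
fps = nframes / duration
timepoints = [(0.0, 2.0), (3.0, 5.0), (6.0, 8.0)]  # stand-in clip windows (seconds)

all_idxs = []
for start_time, end_time in timepoints:
    # Frame range covered by this clip window.
    idxs = torch.arange(
        min(int(math.ceil(fps * start_time)), nframes - 1),
        min(int(math.ceil(fps * end_time)), nframes),
    )
    # Keep only `clip_duration` evenly spaced frames from that range.
    ts = torch.linspace(0, idxs.size(0) - 1, clip_duration).clamp(max=idxs.size(0) - 1).long()
    all_idxs.append(idxs[ts])

print(torch.cat(all_idxs))  # tensor([  0,  29,  45,  74,  90, 119]) -- the only frames to decode
```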
sam_audio/ranking/judge.py
ADDED
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

import torch

from ..model.config import JudgeRankerConfig
from ..model.judge import SAMAudioJudgeModel
from ..processor import SAMAudioJudgeProcessor
from .ranker import Ranker


class JudgeRanker(Ranker):
    def __init__(self, config: JudgeRankerConfig):
        super().__init__()
        self.config = config
        self.model = SAMAudioJudgeModel.from_pretrained(config.checkpoint_or_model_id)
        self.processor = SAMAudioJudgeProcessor.from_pretrained(
            config.checkpoint_or_model_id
        )

    @torch.inference_mode()
    def forward(
        self,
        input_audio: list[torch.Tensor],
        extracted_audio: list[torch.Tensor],
        descriptions: list[str],
        sample_rate: int = 48_000,
        **kwargs,
    ):
        bsz, ncandidates = len(input_audio), len(input_audio[0])
        input_seqs = [x[None] for candidates in input_audio for x in candidates]
        extracted_seqs = [x[None] for candidates in extracted_audio for x in candidates]
        repeated_descriptions = [x for x in descriptions for _ in range(ncandidates)]
        processed = self.processor(
            text=repeated_descriptions,
            input_audio=input_seqs,
            separated_audio=extracted_seqs,
            return_tensors="pt",
            padding=True,
            sampling_rate=sample_rate,
        )
        res = self.model(**processed.to(input_audio[0].device))
        return res.overall.view(bsz, ncandidates)
sam_audio/ranking/ranker.py
ADDED
@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

from abc import ABCMeta, abstractmethod
from typing import List

import torch


class Ranker(torch.nn.Module, metaclass=ABCMeta):
    @abstractmethod
    def forward(self, audio: list[torch.Tensor], **kwargs) -> torch.Tensor:
        """
        Args:
            audio: (list[torch.Tensor]) where each element in the list corresponds to
                the candidates for the i'th generation (num_candidates, num_frames)
        Returns:
            (torch.Tensor) of shape (batch_size, num_candidates) corresponding to the ranking scores
        """
        pass


class EnsembleRanker(Ranker):
    def __init__(self, rankers: List[Ranker], weights: List[float]):
        super().__init__()
        assert len(rankers) == len(weights)
        self.rankers = torch.nn.ModuleList(rankers)
        self.weights = weights

    def forward(self, **kwargs) -> torch.Tensor:
        result = None
        for weight, ranker in zip(self.weights, self.rankers, strict=False):
            if result is None:
                result = weight * ranker(**kwargs)
            else:
                result += weight * ranker(**kwargs)
        return result
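
A minimal sketch of composing rankers with `EnsembleRanker`; `ConstantRanker` is a toy subclass invented here purely to show the weighted-sum behaviour and the keyword-only calling convention.

```python
import torch

from sam_audio.ranking.ranker import EnsembleRanker, Ranker


class ConstantRanker(Ranker):
    """Toy ranker that ignores its inputs and returns a fixed score for every candidate."""

    def __init__(self, value: float):
        super().__init__()
        self.value = value

    def forward(self, extracted_audio, **kwargs) -> torch.Tensor:
        return torch.full((len(extracted_audio), extracted_audio[0].size(0)), self.value)


ensemble = EnsembleRanker(
    rankers=[ConstantRanker(1.0), ConstantRanker(3.0)],
    weights=[0.75, 0.25],
)
candidates = [torch.randn(4, 48_000), torch.randn(4, 48_000)]  # 2 items x 4 candidates each
print(ensemble(extracted_audio=candidates))  # every entry is 0.75 * 1.0 + 0.25 * 3.0 = 1.5
```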
sam_audio/ranking/sound_activity.py
ADDED
@@ -0,0 +1,129 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved\n

from io import BytesIO
from typing import Tuple, Union

import torch
from torchcodec.encoders import AudioEncoder

from ..model.config import SoundActivityRankerConfig
from .ranker import Ranker

try:
    import pydub
except ImportError:
    pydub = None


def get_peak_rms(audio, win_ms=250, hop_ms=100):
    """
    win_length and hop_length are in ms
    """
    last_slice_start = len(audio) - win_ms
    slice_starts = range(0, last_slice_start + 1, hop_ms)
    peak_rms = -1
    for i in slice_starts:
        audio_slice = audio[i : i + win_ms]
        peak_rms = max(peak_rms, audio_slice.rms / audio.max_possible_amplitude)
    # Ensure peak_rms is positive
    peak_rms = max(peak_rms, 0)
    return peak_rms


def torch_tensor_to_pydub(wav: torch.Tensor, sample_rate: int):
    bytesio = BytesIO()
    encoder = AudioEncoder(wav, sample_rate=sample_rate)
    encoder.to_file_like(bytesio, format="wav")
    bytesio.seek(0)
    audio = pydub.AudioSegment.from_file(bytesio, format="wav")
    return audio


def detect_nonsilent(
    path: Union[str, Tuple[torch.Tensor, int]],  # either a file path or pair wav & sr
    min_sil_ms=250,
    sil_threshold=-40,
    threshold_mode="rel_to_max",
):
    TH_MODES = {"abs", "rel_to_max"}
    SAMPLE_RATE = 24_000
    assert threshold_mode in TH_MODES, f"{threshold_mode=} not in {TH_MODES}"
    if isinstance(path, str):
        audio = pydub.AudioSegment.from_file(path)
    else:  # tuple of (tensor, sr)
        audio = torch_tensor_to_pydub(path[0], path[1])
    audio = audio.set_frame_rate(SAMPLE_RATE)
    if threshold_mode == "rel_to_max":
        peak_rms = get_peak_rms(audio)
        sil_threshold = sil_threshold + pydub.utils.ratio_to_db(
            peak_rms
        )  # convert to absolute db threshold
    elif threshold_mode == "abs":
        pass
    else:
        raise NotImplementedError(f"Unknown threshold_mode '{threshold_mode}'")
    spans = pydub.silence.detect_nonsilent(
        audio, min_silence_len=min_sil_ms, silence_thresh=sil_threshold, seek_step=10
    )
    spans = [(round(start / 1000, 3), round(end / 1000, 3)) for start, end in spans]
    return spans


def compute_iou_recall_precision(hyp_spans, ref_spans):
    def span_length(span):
        return span[1] - span[0]

    def intersection_length(span1, span2):
        return max(0, min(span1[1], span2[1]) - max(span1[0], span2[0]))

    total_hyp_length = sum(span_length(span) for span in hyp_spans)
    total_ref_length = sum(span_length(span) for span in ref_spans)
    total_intersection = 0
    for hyp_span in hyp_spans:
        for ref_span in ref_spans:
            total_intersection += intersection_length(hyp_span, ref_span)

    union_spans = hyp_spans + ref_spans  # Combine both lists to compute union
    union_length = sum(span_length(span) for span in union_spans) - total_intersection

    iou = total_intersection / union_length if union_length > 0 else 0
    recall = total_intersection / total_ref_length if total_ref_length > 0 else 0
    precision = total_intersection / total_hyp_length if total_hyp_length > 0 else 0

    return {"iou": iou, "recall": recall, "precision": precision}


class SoundActivityRanker(Ranker):
    def __init__(self, config: SoundActivityRankerConfig):
        if pydub is None:
            raise ImportError(
                'Install reranking dependencies: `pip install "sam-audio[reranking]"`'
            )
        super().__init__()
        self.config = config

    @torch.inference_mode()
    def forward(
        self,
        extracted_audio: list[torch.Tensor],
        spans: list[list[list[float]]],
        sample_rate: int = 48_000,
        **kwargs,
    ):
        device = extracted_audio[0].device
        scores = []
        for wav, current_spans in zip(extracted_audio, spans, strict=True):
            wav = wav.to(torch.float32).cpu()
            # get non-silent spans
            hyp_spans = detect_nonsilent(
                (wav, sample_rate),
                sil_threshold=self.config.sil_threshold,
                threshold_mode=self.config.threshold_mode,
            )
            timestamps = [[span[1], span[2]] for span in current_spans]
            result = compute_iou_recall_precision(hyp_spans, timestamps)
            scores.append(result[self.config.metric])

        # convert to tensor
        scores = torch.tensor(scores, device=device)
        return scores
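
A worked example of `compute_iou_recall_precision` (importing the module assumes the package and its `torchcodec` dependency are installed): a 2 s hypothesis span overlapping a 2 s reference span by 1 s gives a union of 3 s, so IoU is 1/3 while recall and precision are both 0.5.

```python
from sam_audio.ranking.sound_activity import compute_iou_recall_precision

hyp = [(0.0, 2.0)]  # detected non-silent span, in seconds
ref = [(1.0, 3.0)]  # annotated target span

metrics = compute_iou_recall_precision(hyp, ref)
# intersection = 1.0 s, union = 2.0 + 2.0 - 1.0 = 3.0 s
print(metrics)  # {'iou': 0.333..., 'recall': 0.5, 'precision': 0.5}
```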