| --- |
| library_name: transformers |
| pipeline_tag: mask-generation |
| inference: true |
| widget: |
| - text: Hello! |
| example_title: Hello world |
| group: Python |
| base_model: |
| - facebook/sam3 |
| --- |
| |
| This tiny model is intended for debugging only. It is randomly initialized with a scaled-down configuration adapted from [facebook/sam3](https://huggingface.co/facebook/sam3), so its predictions are not meaningful. |
|
|
| ### Example usage: |
|
|
| ```python |
| import requests |
| import torch |
| from PIL import Image |
| from transformers import Sam3Model, Sam3Processor |
| from transformers.models.sam3.modeling_sam3 import Sam3Config |
| |
| model_id = "tiny-random/sam3" |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| model = Sam3Model.from_pretrained(model_id).to(device) |
| processor = Sam3Processor.from_pretrained(model_id) |
| |
| kitchen_url = "http://images.cocodataset.org/val2017/000000136466.jpg" |
| kitchen_image = Image.open(requests.get( |
| kitchen_url, stream=True).raw).convert("RGB") |
| # Segment "handle" but exclude the oven handle using a negative box |
| text = "handle" |
| # Negative box covering oven handle area (xyxy): [40, 183, 318, 204] |
| oven_handle_box = [40, 183, 318, 204] |
| input_boxes = [[oven_handle_box]] |
| inputs = processor( |
| images=kitchen_image, |
| text=text, |
| input_boxes=input_boxes, |
| input_boxes_labels=[[0]], # 0 = negative (exclude this region) |
| return_tensors="pt" |
| ).to(device) |
| with torch.no_grad(): |
| outputs = model(**inputs) |
| # Post-process results |
| results = processor.post_process_instance_segmentation( |
| outputs, |
| threshold=0.5, |
| mask_threshold=0.5, |
| target_sizes=inputs.get("original_sizes").tolist() |
| )[0] |
| print(results) |
| # This will segment pot handles but exclude the oven handle |
| ``` |
|
|
| ### Code to create this repo: |
|
|
| ```python |
| import json |
| from pathlib import Path |
| |
| import accelerate |
| import torch |
| from huggingface_hub import file_exists, hf_hub_download |
| from transformers import ( |
| AutoConfig, |
| AutoModelForCausalLM, |
| AutoProcessor, |
| GenerationConfig, |
| Sam3Processor, |
| set_seed, |
| ) |
| from transformers.models.sam3.modeling_sam3 import Sam3Config, Sam3Model |
| |
| source_model_id = "facebook/sam3" |
| save_folder = "/tmp/tiny-random/sam3" |
| |
| processor = Sam3Processor.from_pretrained( |
| source_model_id, trust_remote_code=True) |
| processor.save_pretrained(save_folder) |
| |
| with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f: |
| config_json = json.load(f) |
| HIDDEN_SIZE = 16 |
| INTERMEDIATE_SIZE = 32 |
| NUM_ATTENTION_HEADS = 2 |
| config_json['detector_config']['detr_decoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['detector_config']['detr_encoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['detector_config']['geometry_encoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['detector_config']['mask_decoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['detector_config']['text_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| 'projection_dim': HIDDEN_SIZE, |
| 'num_hidden_layers': 2, |
| }) |
| config_json['detector_config']['vision_config']['backbone_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| 'fpn_hidden_size': HIDDEN_SIZE, |
| 'global_attn_indexes': [1, 3, 5, 7], |
| 'num_hidden_layers': 8, |
| }) |
| config_json['detector_config']['vision_config'].update({ |
| 'fpn_hidden_size': HIDDEN_SIZE, |
| }) |
| config_json['tracker_config']['mask_decoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'iou_head_hidden_dim': HIDDEN_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['tracker_config'].update({ |
| 'mask_downsampler_embed_dim': HIDDEN_SIZE, |
| 'memory_attention_feed_forward_hidden_size': HIDDEN_SIZE, |
| 'memory_attention_hidden_size': HIDDEN_SIZE, |
| 'memory_encoder_hidden_size': HIDDEN_SIZE, |
| 'memory_fuser_embed_dim': HIDDEN_SIZE, |
| 'memory_fuser_intermediate_dim': INTERMEDIATE_SIZE, |
| }) |
| config_json['tracker_config']['prompt_encoder_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| }) |
| config_json['tracker_config']['vision_config']['backbone_config'].update({ |
| 'hidden_size': HIDDEN_SIZE, |
| 'intermediate_size': INTERMEDIATE_SIZE, |
| 'num_attention_heads': NUM_ATTENTION_HEADS, |
| 'global_attn_indexes': [1, 3, 5, 7], |
| 'num_hidden_layers': 8, |
| }) |
| config_json['tracker_config']['vision_config'].update({ |
| 'fpn_hidden_size': HIDDEN_SIZE, |
| }) |
| |
| with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f: |
| json.dump(config_json, f, indent=2) |
| |
| config = Sam3Config.from_pretrained( |
| save_folder, |
| trust_remote_code=True, |
| ) |
| print(config) |
| torch.set_default_dtype(torch.float32) |
| model = Sam3Model(config) |
| set_seed(42) |
| model = model.cpu() |
| with torch.no_grad(): |
| for name, p in sorted(model.named_parameters()): |
| torch.nn.init.normal_(p, 0, 0.1) |
| print(name, p.shape) |
| model.save_pretrained(save_folder) |
| # print(list(model.state_dict().keys())) |
| # there is some bug in model.save_pretrained... Re-save the model weights here. |
| import safetensors.torch |
| safetensors.torch.save_file( |
| tensors=model.state_dict(), |
| filename=f"{save_folder}/model.safetensors" |
| ) |
| ``` |
|
|
| ### Printing the model: |
|
|
| ```text |
| Sam3Model( |
| (vision_encoder): Sam3VisionModel( |
| (backbone): Sam3ViTModel( |
| (embeddings): Sam3ViTEmbeddings( |
| (patch_embeddings): Sam3ViTPatchEmbeddings( |
| (projection): Conv2d(3, 16, kernel_size=(14, 14), stride=(14, 14), bias=False) |
| ) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (layer_norm): LayerNorm((16,), eps=1e-06, elementwise_affine=True) |
| (layers): ModuleList( |
| (0-7): 8 x Sam3ViTLayer( |
| (layer_norm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True) |
| (rotary_emb): Sam3ViTRotaryEmbedding() |
| (attention): Sam3ViTRoPEAttention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (layer_norm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True) |
| (mlp): Sam3MLP( |
| (activation_fn): GELUActivation() |
| (fc1): Linear(in_features=16, out_features=32, bias=True) |
| (fc2): Linear(in_features=32, out_features=16, bias=True) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| ) |
| ) |
| (neck): Sam3VisionNeck( |
| (position_encoding): Sam3SinePositionEmbedding() |
| (fpn_layers): ModuleList( |
| (0): Sam3FPNLayer( |
| (scale_layers): ModuleList( |
| (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2)) |
| (1): GELU(approximate='none') |
| (2): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2)) |
| ) |
| (proj1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1)) |
| (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) |
| ) |
| (1): Sam3FPNLayer( |
| (scale_layers): ModuleList( |
| (0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2)) |
| ) |
| (proj1): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1)) |
| (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) |
| ) |
| (2): Sam3FPNLayer( |
| (scale_layers): ModuleList() |
| (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1)) |
| (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) |
| ) |
| (3): Sam3FPNLayer( |
| (scale_layers): ModuleList( |
| (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) |
| ) |
| (proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1)) |
| (proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) |
| ) |
| ) |
| ) |
| ) |
| (text_encoder): CLIPTextModelWithProjection( |
| (text_model): CLIPTextTransformer( |
| (embeddings): CLIPTextEmbeddings( |
| (token_embedding): Embedding(49408, 16) |
| (position_embedding): Embedding(32, 16) |
| ) |
| (encoder): CLIPEncoder( |
| (layers): ModuleList( |
| (0-1): 2 x CLIPEncoderLayer( |
| (self_attn): CLIPAttention( |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (out_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (mlp): CLIPMLP( |
| (activation_fn): GELUActivation() |
| (fc1): Linear(in_features=16, out_features=32, bias=True) |
| (fc2): Linear(in_features=32, out_features=16, bias=True) |
| ) |
| (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| ) |
| ) |
| ) |
| (final_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| ) |
| (text_projection): Linear(in_features=16, out_features=16, bias=False) |
| ) |
| (text_projection): Linear(in_features=16, out_features=16, bias=True) |
| (geometry_encoder): Sam3GeometryEncoder( |
| (position_encoding): Sam3SinePositionEmbedding() |
| (label_embed): Embedding(2, 16) |
| (cls_embed): Embedding(1, 16) |
| (boxes_direct_project): Linear(in_features=4, out_features=16, bias=True) |
| (boxes_pool_project): Conv2d(16, 16, kernel_size=(7, 7), stride=(1, 1)) |
| (boxes_pos_enc_project): Linear(in_features=18, out_features=16, bias=True) |
| (vision_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (final_proj): Linear(in_features=16, out_features=16, bias=True) |
| (prompt_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (layers): ModuleList( |
| (0-2): 3 x Sam3GeometryEncoderLayer( |
| (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (self_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (dropout): Dropout(p=0.1, inplace=False) |
| (cross_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (mlp): Sam3MLP( |
| (activation_fn): ReLU() |
| (fc1): Linear(in_features=16, out_features=32, bias=True) |
| (fc2): Linear(in_features=32, out_features=16, bias=True) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| ) |
| ) |
| (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| ) |
| (detr_encoder): Sam3DetrEncoder( |
| (layers): ModuleList( |
| (0-5): 6 x Sam3DetrEncoderLayer( |
| (layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (self_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (dropout): Dropout(p=0.1, inplace=False) |
| (cross_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (mlp): Sam3MLP( |
| (activation_fn): ReLU() |
| (fc1): Linear(in_features=16, out_features=32, bias=True) |
| (fc2): Linear(in_features=32, out_features=16, bias=True) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| ) |
| ) |
| ) |
| (detr_decoder): Sam3DetrDecoder( |
| (layers): ModuleList( |
| (0-5): 6 x Sam3DetrDecoderLayer( |
| (self_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (self_attn_dropout): Dropout(p=0.1, inplace=False) |
| (self_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (text_cross_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (text_cross_attn_dropout): Dropout(p=0.1, inplace=False) |
| (text_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (vision_cross_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (vision_cross_attn_dropout): Dropout(p=0.1, inplace=False) |
| (vision_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (mlp): Sam3MLP( |
| (activation_fn): ReLU() |
| (fc1): Linear(in_features=16, out_features=32, bias=True) |
| (fc2): Linear(in_features=32, out_features=16, bias=True) |
| (dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (mlp_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (mlp_dropout): Dropout(p=0.1, inplace=False) |
| ) |
| ) |
| (output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (box_head): Sam3DecoderMLP( |
| (layer1): Linear(in_features=16, out_features=16, bias=True) |
| (layer2): Linear(in_features=16, out_features=16, bias=True) |
| (layer3): Linear(in_features=16, out_features=4, bias=True) |
| ) |
| (query_embed): Embedding(200, 16) |
| (reference_points): Embedding(200, 4) |
| (presence_token): Embedding(1, 16) |
| (presence_head): Sam3DecoderMLP( |
| (layer1): Linear(in_features=16, out_features=16, bias=True) |
| (layer2): Linear(in_features=16, out_features=16, bias=True) |
| (layer3): Linear(in_features=16, out_features=1, bias=True) |
| ) |
| (presence_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (ref_point_head): Sam3DecoderMLP( |
| (layer1): Linear(in_features=32, out_features=16, bias=True) |
| (layer2): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (box_rpb_embed_x): Sam3DecoderMLP( |
| (layer1): Linear(in_features=2, out_features=16, bias=True) |
| (layer2): Linear(in_features=16, out_features=2, bias=True) |
| ) |
| (box_rpb_embed_y): Sam3DecoderMLP( |
| (layer1): Linear(in_features=2, out_features=16, bias=True) |
| (layer2): Linear(in_features=16, out_features=2, bias=True) |
| ) |
| (position_encoding): Sam3SinePositionEmbedding() |
| ) |
| (mask_decoder): Sam3MaskDecoder( |
| (pixel_decoder): Sam3PixelDecoder( |
| (conv_layers): ModuleList( |
| (0-2): 3 x Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) |
| ) |
| (norms): ModuleList( |
| (0-2): 3 x GroupNorm(8, 16, eps=1e-05, affine=True) |
| ) |
| ) |
| (mask_embedder): Sam3MaskEmbedder( |
| (layers): ModuleList( |
| (0-2): 3 x Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (activation): ReLU() |
| ) |
| (instance_projection): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1)) |
| (semantic_projection): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1)) |
| (prompt_cross_attn): Sam3Attention( |
| (q_proj): Linear(in_features=16, out_features=16, bias=True) |
| (k_proj): Linear(in_features=16, out_features=16, bias=True) |
| (v_proj): Linear(in_features=16, out_features=16, bias=True) |
| (o_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| (prompt_cross_attn_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (prompt_cross_attn_dropout): Dropout(p=0.0, inplace=False) |
| ) |
| (dot_product_scoring): Sam3DotProductScoring( |
| (text_mlp): Sam3DecoderMLP( |
| (layer1): Linear(in_features=16, out_features=32, bias=True) |
| (layer2): Linear(in_features=32, out_features=16, bias=True) |
| ) |
| (text_mlp_dropout): Dropout(p=0.1, inplace=False) |
| (text_mlp_out_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) |
| (text_proj): Linear(in_features=16, out_features=16, bias=True) |
| (query_proj): Linear(in_features=16, out_features=16, bias=True) |
| ) |
| ) |
| ``` |