# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import base64
import os
from typing import Any, Optional
from openai import OpenAI


def get_image_base64_and_mime(image_path):
    """Convert an image file to a base64 string and infer its MIME type.

    Returns a (base64_data, mime_type) tuple, or (None, None) on failure.
    """
    try:
        # Infer the MIME type from the file extension
        ext = os.path.splitext(image_path)[1].lower()
        mime_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".bmp": "image/bmp",
        }
        mime_type = mime_types.get(ext, "image/jpeg")  # Default to JPEG

        # Read the image and encode its bytes as base64
        with open(image_path, "rb") as image_file:
            base64_data = base64.b64encode(image_file.read()).decode("utf-8")
        return base64_data, mime_type
    except Exception as e:
        print(f"Error converting image to base64: {e}")
        return None, None
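
# Example (a minimal sketch; "frame_0001.png" is a hypothetical path):
#
#     data, mime = get_image_base64_and_mime("frame_0001.png")
#     if data is not None:
#         url = f"data:{mime};base64,{data}"  # data URL for OpenAI-style APIs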


def send_generate_request(
    messages,
    server_url=None,
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    api_key=None,
    max_tokens=4096,
):
    """
    Send a chat request to an OpenAI-compatible API endpoint using the OpenAI client library.

    Args:
        messages (list): A list of message dicts, each containing a role and content.
            Content items of the form {"type": "image", "image": <path>} are
            converted to base64 image_url entries before sending.
        server_url (str): The base URL of the server, e.g. "http://127.0.0.1:8000/v1".
        model (str): The model to use for generation (default: Llama 4 Maverick FP8).
        api_key (str): API key for the endpoint, if one is required.
        max_tokens (int): Maximum number of tokens to generate (default: 4096).

    Returns:
        str: The generated response text from the server, or None on failure.
    """
    # Convert any {"type": "image", "image": <path>} entries to base64 image_url entries
    processed_messages = []
    for message in messages:
        processed_message = message.copy()
        if message["role"] == "user" and "content" in message:
            processed_content = []
            for c in message["content"]:
                if isinstance(c, dict) and c.get("type") == "image":
                    image_path = c["image"]
                    print(f"image_path: {image_path}")
                    # Escape "?" in the path (files on disk are stored with "%3F")
                    new_image_path = image_path.replace("?", "%3F")
                    # get_image_base64_and_mime catches its own errors (missing
                    # file, unreadable image, ...) and returns (None, None)
                    base64_image, mime_type = get_image_base64_and_mime(new_image_path)
                    if base64_image is None:
                        print(f"Warning: Could not convert image to base64: {new_image_path}")
                        continue
                    # Build the OpenAI-style image_url structure with inline base64 data
                    processed_content.append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                                "detail": "high",
                            },
                        }
                    )
                else:
                    processed_content.append(c)
            processed_message["content"] = processed_content
        processed_messages.append(processed_message)

    # Create an OpenAI client pointed at the custom base URL
    client = OpenAI(api_key=api_key, base_url=server_url)
    try:
        print(f"🔍 Calling model {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=processed_messages,
            max_completion_tokens=max_tokens,
            n=1,
        )
        # Extract the response content from the first choice
        if response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            print(f"Unexpected response format: {response}")
            return None
    except Exception as e:
        print(f"Request failed: {e}")
        return None
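
# Example (a minimal sketch; the server URL, API key, and image path are
# assumptions, not part of this module):
#
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "text", "text": "Describe this frame."},
#                 {"type": "image", "image": "frames/frame_0001.png"},
#             ],
#         }
#     ]
#     reply = send_generate_request(
#         messages,
#         server_url="http://127.0.0.1:8000/v1",
#         api_key="EMPTY",
#     )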


def send_direct_request(
    llm: Any,
    messages: list[dict[str, Any]],
    sampling_params: Any,
) -> Optional[str]:
    """
    Run inference on a vLLM model instance directly, without going through a server.

    Args:
        llm: Initialized vLLM LLM instance (passed in from external initialization).
        messages: List of message dicts with role and content (OpenAI format).
        sampling_params: vLLM SamplingParams instance (initialized externally).

    Returns:
        str: Generated response text, or None if inference fails.
    """
    try:
        # Convert any {"type": "image", "image": <path>} entries to base64
        # image_url entries, mirroring the preprocessing in send_generate_request
        processed_messages = []
        for message in messages:
            processed_message = message.copy()
            if message["role"] == "user" and "content" in message:
                processed_content = []
                for c in message["content"]:
                    if isinstance(c, dict) and c.get("type") == "image":
                        image_path = c["image"]
                        # Escape "?" in the path (files on disk are stored with "%3F")
                        new_image_path = image_path.replace("?", "%3F")
                        base64_image, mime_type = get_image_base64_and_mime(new_image_path)
                        if base64_image is None:
                            print(f"Warning: Could not convert image: {new_image_path}")
                            continue
                        # vLLM's chat interface expects the OpenAI image_url format
                        processed_content.append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                },
                            }
                        )
                    else:
                        processed_content.append(c)
                processed_message["content"] = processed_content
            processed_messages.append(processed_message)
print("πŸ” Running direct inference with vLLM...")
# Run inference using vLLM's chat interface
outputs = llm.chat(
messages=processed_messages,
sampling_params=sampling_params,
)
# Extract the generated text from the first output
if outputs and len(outputs) > 0:
generated_text = outputs[0].outputs[0].text
return generated_text
else:
print(f"Unexpected output format: {outputs}")
return None
except Exception as e:
print(f"Direct inference failed: {e}")
return None
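
# Example (a minimal sketch; the model name and sampling settings are
# assumptions, and a vLLM build with multimodal support is required):
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8")
#     sampling_params = SamplingParams(temperature=0.0, max_tokens=4096)
#     reply = send_direct_request(llm, messages, sampling_params)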