# handler.py
import io, base64, time, torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


class EndpointHandler:
    """
    CLIP ViT-L/14 zero-shot classifier.
    Expects JSON: {
        "inputs": {
            "image": "<base64>",
            "candidate_labels": ["prompt-1", "prompt-2", ...]
        }
    }
    """

    def __init__(self, path=""):
        self.model = CLIPModel.from_pretrained(path)
        self.processor = CLIPProcessor.from_pretrained(path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device).eval()
        self.cache: dict[str, torch.Tensor] = {}  # prompt -> normalized text embedding
    # -------------------------------------------------------
    def __call__(self, data):
        T = {}  # per-stage timings in ms
        t0 = time.perf_counter()

        payload = data.get("inputs", data)
        img_b64 = payload["image"]
        prompts = payload.get("candidate_labels", [])
        if not prompts:
            return {"error": "candidate_labels list is empty"}
        # ---- text embeddings (per-process cache) ----
        t = time.perf_counter()
        missing = [p for p in prompts if p not in self.cache]
        if missing:
            tok = self.processor(text=missing, return_tensors="pt",
                                 padding=True).to(self.device)
            with torch.no_grad():
                emb = self.model.get_text_features(**tok)
                emb = emb / emb.norm(dim=-1, keepdim=True)  # L2-normalize
            for p, e in zip(missing, emb):
                self.cache[p] = e
        txt_feat = torch.stack([self.cache[p] for p in prompts])
        T["encode_text"] = (time.perf_counter() - t) * 1000  # ms
        # ---- image preprocessing ----
        t = time.perf_counter()
        img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_in = self.processor(images=img, return_tensors="pt").to(self.device)
        T["decode_resize"] = (time.perf_counter() - t) * 1000
        # ---- image embedding (mixed precision on GPU only) ----
        t = time.perf_counter()
        # torch.cuda.amp.autocast is deprecated and CUDA-only; enable
        # autocast only when a GPU is actually available.
        with torch.no_grad(), torch.amp.autocast(device_type=self.device,
                                                 enabled=self.device == "cuda"):
            img_feat = self.model.get_image_features(**img_in)
            img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        # cast both sides back to fp32 so the similarity matmul is full precision
        img_feat = img_feat.float()
        txt_feat = txt_feat.float()
        T["encode_image"] = (time.perf_counter() - t) * 1000
        # ---- similarity & softmax ----
        t = time.perf_counter()
        # the factor 100 approximates CLIP's learned logit scale, exp(logit_scale)
        probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
        T["similarity_softmax"] = (time.perf_counter() - t) * 1000
        # ---- log timings ----
        total = (time.perf_counter() - t0) * 1000
        print(f"[CLIP timings] total={total:.1f} ms | " +
              " | ".join(f"{k}={v:.1f}" for k, v in T.items()),
              flush=True)
        # ---- build response ----
        return [
            {"label": p, "score": float(s)}
            for p, s in sorted(zip(prompts, probs), key=lambda x: x[1], reverse=True)
        ]
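
# ---- local smoke test (sketch) ----
# Minimal usage example, not part of the endpoint contract. The model id
# "openai/clip-vit-large-patch14" (the docstring above targets CLIP ViT-L/14)
# and the file name "test.jpg" are placeholders, not values from this repo.
if __name__ == "__main__":
    with open("test.jpg", "rb") as f:  # placeholder image
        b64 = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler("openai/clip-vit-large-patch14")  # placeholder path
    preds = handler({
        "inputs": {
            "image": b64,
            "candidate_labels": ["a photo of a cat", "a photo of a dog"],
        }
    })
    print(preds)  # [{"label": ..., "score": ...}, ...] sorted by score, descending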
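
# ---- calling the deployed endpoint (sketch) ----
# How a client could call this handler once deployed as a Hugging Face
# Inference Endpoint; ENDPOINT_URL and HF_TOKEN are placeholders, and the
# JSON body follows the schema in the class docstring. Kept commented out
# so this file stays import-safe.
#
#   import base64, requests
#
#   ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
#   HF_TOKEN = "hf_..."  # placeholder
#
#   with open("test.jpg", "rb") as f:
#       b64 = base64.b64encode(f.read()).decode("utf-8")
#
#   r = requests.post(
#       ENDPOINT_URL,
#       headers={"Authorization": f"Bearer {HF_TOKEN}"},
#       json={"inputs": {"image": b64,
#                        "candidate_labels": ["a photo of a cat",
#                                             "a photo of a dog"]}},
#   )
#   print(r.json())  # [{"label": ..., "score": ...}, ...]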