"""OliVAS Saliency Cloud Run Service. Runs DeepGaze saliency inference and returns: - saliency map (base64 float32 bytes) - gaze sequence - hotspots - design effectiveness scores """ import base64 import io import logging import os from contextlib import asynccontextmanager import numpy as np from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile from PIL import Image from scipy.ndimage import gaussian_filter, zoom from scipy.special import logsumexp logging.basicConfig(level=logging.INFO) logger = logging.getLogger("olivas.saliency") INTERNAL_SECRET = os.environ.get("CLOUD_RUN_SECRET", "") DEVICE = os.environ.get("DEVICE", "auto") # Global model cache: {model_key: {"model": ..., "centerbias": ...}} _model_cache: dict = {} VARIANT_MAP = { "deepgaze_i": ("DeepGazeI", "I"), "deepgaze_iie": ("DeepGazeIIE", "IIE"), "deepgaze_iii": ("DeepGazeIII", "III"), } @asynccontextmanager async def lifespan(app: FastAPI): logger.info("OliVAS Saliency service starting") yield _model_cache.clear() logger.info("OliVAS Saliency service stopped") app = FastAPI(title="OliVAS Saliency Service", lifespan=lifespan) def _check_auth(x_internal_secret: str | None) -> None: if INTERNAL_SECRET and x_internal_secret != INTERNAL_SECRET: raise HTTPException(status_code=401, detail="Unauthorized") def _resolve_device() -> str: if DEVICE == "auto": try: import torch return "cuda" if torch.cuda.is_available() else "cpu" except ImportError: return "cpu" return DEVICE def _get_model(model_name: str) -> dict: device = _resolve_device() key = f"{model_name}:{device}" if key in _model_cache: return _model_cache[key] if model_name not in VARIANT_MAP: raise ValueError(f"Unknown model: {model_name}. Choose from {list(VARIANT_MAP)}") class_name, _ = VARIANT_MAP[model_name] import torch import deepgaze_pytorch logger.info(f"Loading {class_name} on {device}...") device_obj = torch.device(device) model_cls = getattr(deepgaze_pytorch, class_name) model = model_cls(pretrained=True).to(device_obj) model.eval() # Pre-compute centerbias template size = 1024 x = np.linspace(-1, 1, size) y = np.linspace(-1, 1, size) xx, yy = np.meshgrid(x, y) centerbias = -0.5 * (xx**2 + yy**2) / 0.5**2 _model_cache[key] = {"model": model, "centerbias": centerbias, "device": device_obj} logger.info(f"Loaded {class_name}") return _model_cache[key] def _run_inference(image: Image.Image, model_name: str) -> np.ndarray: import torch model_data = _get_model(model_name) model = model_data["model"] centerbias_template = model_data["centerbias"] device_obj = model_data["device"] img_np = np.array(image.convert("RGB")) h, w = img_np.shape[:2] image_tensor = torch.tensor([img_np.transpose(2, 0, 1)]).float().to(device_obj) cb = zoom( centerbias_template, (h / centerbias_template.shape[0], w / centerbias_template.shape[1]), order=0, ) cb -= logsumexp(cb) centerbias_tensor = torch.tensor([cb]).float().to(device_obj) with torch.no_grad(): log_density = model(image_tensor, centerbias_tensor) saliency = torch.exp(log_density).cpu().numpy().squeeze() sal_min, sal_max = saliency.min(), saliency.max() if sal_max - sal_min > 1e-10: saliency = (saliency - sal_min) / (sal_max - sal_min) else: saliency = np.zeros_like(saliency) return saliency def _prepare_for_inference(image: Image.Image, max_size: int = 1024) -> tuple[Image.Image, float]: w, h = image.size scale = max_size / max(w, h) if scale < 1.0: new_size = (int(w * scale), int(h * scale)) return image.resize(new_size, Image.LANCZOS), scale return image, 1.0 def _upscale_saliency(saliency: np.ndarray, target_h: int, target_w: int) -> np.ndarray: if saliency.shape == (target_h, target_w): return saliency h_scale = target_h / saliency.shape[0] w_scale = target_w / saliency.shape[1] return zoom(saliency, (h_scale, w_scale), order=1) def _extract_gaze_sequence(saliency: np.ndarray, num_fixations: int = 5) -> list[dict]: sal = saliency.copy().astype(np.float64) h, w = sal.shape inhibition_radius = int(max(h, w) * 0.1) fixations = [] for rank in range(1, num_fixations + 1): smoothed = gaussian_filter(sal, sigma=max(h, w) * 0.01) if smoothed.max() < 1e-10: break peak_idx = np.unravel_index(np.argmax(smoothed), smoothed.shape) py, px = int(peak_idx[0]), int(peak_idx[1]) prob = float(saliency[py, px]) fixations.append({ "rank": rank, "x": px, "y": py, "x_pct": round(px / w * 100, 1), "y_pct": round(py / h * 100, 1), "probability": round(prob, 4), }) yy, xx = np.ogrid[:h, :w] mask = (xx - px) ** 2 + (yy - py) ** 2 <= inhibition_radius**2 sal[mask] = 0.0 return fixations def _extract_hotspots(saliency: np.ndarray, num_hotspots: int = 5) -> list[dict]: sal = saliency.copy() h, w = sal.shape hotspots = [] radius = int(max(h, w) * 0.08) for i in range(num_hotspots): smoothed = gaussian_filter(sal, sigma=max(h, w) * 0.015) peak_idx = np.unravel_index(np.argmax(smoothed), smoothed.shape) py, px = int(peak_idx[0]), int(peak_idx[1]) intensity = float(saliency[py, px]) x1, y1 = max(0, px - radius), max(0, py - radius) x2, y2 = min(w, px + radius), min(h, py + radius) hotspots.append({ "rank": i + 1, "center_x": px, "center_y": py, "x": x1, "y": y1, "width": x2 - x1, "height": y2 - y1, "intensity": round(intensity, 4), }) yy, xx = np.ogrid[:h, :w] mask = (xx - px) ** 2 + (yy - py) ** 2 <= radius**2 sal[mask] = 0.0 return hotspots def _compute_design_score( saliency_full: np.ndarray, hotspots: list[dict], gaze_seq: list[dict] ) -> tuple[float, float]: sal_sum = saliency_full.sum() if sal_sum > 0: prob_dist = saliency_full / sal_sum prob_dist = prob_dist[prob_dist > 0] entropy = -np.sum(prob_dist * np.log2(prob_dist)) max_entropy = np.log2(saliency_full.size) raw_concentration = (1 - entropy / max_entropy) * 100 else: raw_concentration = 0.0 entropy_score = round(float(np.clip(raw_concentration, 0, 100)), 1) entropy_adjusted = float(np.sqrt(max(raw_concentration, 0) / 100)) * 100 if len(hotspots) >= 2: top_intensity = hotspots[0]["intensity"] rest_mean = float(np.mean([h["intensity"] for h in hotspots[1:]])) dominance_ratio = top_intensity / rest_mean if rest_mean > 0 else 10.0 peak_dominance = float(100 * (1 - np.exp(-0.5 * dominance_ratio))) elif len(hotspots) == 1: peak_dominance = 95.0 else: peak_dominance = 50.0 intensities = [h["intensity"] for h in hotspots] n = len(intensities) if n >= 2: concordant = sum( 1 for i in range(n) for j in range(i + 1, n) if intensities[i] > intensities[j] ) total_pairs = n * (n - 1) // 2 monotonicity = concordant / total_pairs if total_pairs > 0 else 1.0 drop_ratio = 1 - (intensities[-1] / intensities[0]) if intensities[0] > 0 else 0.0 hierarchy_clarity = float((0.6 * monotonicity + 0.4 * drop_ratio) * 100) else: hierarchy_clarity = 70.0 gaze_points = [(g["x"], g["y"]) for g in gaze_seq] ng = len(gaze_points) if ng >= 3: angles = [] for i in range(ng - 2): ax = gaze_points[i + 1][0] - gaze_points[i][0] ay = gaze_points[i + 1][1] - gaze_points[i][1] bx = gaze_points[i + 2][0] - gaze_points[i + 1][0] by = gaze_points[i + 2][1] - gaze_points[i + 1][1] mag_a = np.sqrt(ax**2 + ay**2) mag_b = np.sqrt(bx**2 + by**2) if mag_a > 0 and mag_b > 0: cos_angle = np.clip((ax * bx + ay * by) / (mag_a * mag_b), -1, 1) angles.append(float(np.degrees(np.arccos(cos_angle)))) avg_angle = float(np.mean(angles)) if angles else 70.0 angle_smoothness = max(0.0, 100 - (avg_angle / 180) * 100) total_path = sum( np.sqrt( (gaze_points[i + 1][0] - gaze_points[i][0]) ** 2 + (gaze_points[i + 1][1] - gaze_points[i][1]) ** 2 ) for i in range(ng - 1) ) direct_dist = np.sqrt( (gaze_points[-1][0] - gaze_points[0][0]) ** 2 + (gaze_points[-1][1] - gaze_points[0][1]) ** 2 ) path_efficiency = float(direct_dist / total_path) if total_path > 0 else 1.0 gaze_coherence = 0.7 * angle_smoothness + 0.3 * (path_efficiency * 100) else: gaze_coherence = 70.0 composite = ( 0.30 * peak_dominance + 0.25 * hierarchy_clarity + 0.25 * gaze_coherence + 0.20 * entropy_adjusted ) overall_score = round(float(np.clip(composite, 0, 100)), 1) return overall_score, entropy_score @app.get("/health") async def health(): return {"status": "ok", "device": _resolve_device()} @app.post("/predict") async def predict( image: UploadFile = File(...), model: str = Form("deepgaze_iie"), x_internal_secret: str | None = Header(None), ): _check_auth(x_internal_secret) image_data = await image.read() pil_image = Image.open(io.BytesIO(image_data)).convert("RGB") orig_w, orig_h = pil_image.size resized, _ = _prepare_for_inference(pil_image) logger.info(f"Inference: model={model} original={orig_w}x{orig_h} resized={resized.size}") saliency = _run_inference(resized, model) saliency_full = _upscale_saliency(saliency, orig_h, orig_w) gaze_sequence = _extract_gaze_sequence(saliency_full, num_fixations=5) hotspots = _extract_hotspots(saliency_full, num_hotspots=5) overall_score, entropy_score = _compute_design_score(saliency_full, hotspots, gaze_sequence) saliency_b64 = base64.b64encode(saliency_full.astype(np.float32).tobytes()).decode() logger.info(f"Done: score={overall_score} entropy={entropy_score}") return { "saliency_b64": saliency_b64, "shape": [orig_h, orig_w], "gaze_sequence": gaze_sequence, "hotspots": hotspots, "overall_score": overall_score, "entropy_score": entropy_score, }