# Source code for cw.lib.video_analysis.object_detection

"""
Object detection using YOLO v8.

Detects objects, people, and products in video frames.
"""

import logging
from pathlib import Path
from typing import Dict, List

from PIL import Image
from ultralytics import YOLO

logger = logging.getLogger(__name__)

# Module-level model cache
_yolo_model = None


def get_yolo_model(model_name: str = "yolov8x.pt") -> YOLO:
    """
    Get or load YOLO model (cached at module level).

    Only one model instance is cached: the first variant requested wins.
    Later calls with a different ``model_name`` return the already-loaded
    model and log a warning, rather than silently serving the wrong
    variant (the previous behavior gave no indication of the mismatch).

    Args:
        model_name: YOLO model variant (yolov8n/s/m/l/x.pt)
                   Default: yolov8x.pt (highest accuracy)

    Returns:
        Loaded YOLO model instance
    """
    global _yolo_model, _yolo_model_name

    # The name may not exist yet at module level (it is first assigned
    # here), so read it defensively.
    loaded_name = globals().get("_yolo_model_name")

    if _yolo_model is None:
        logger.info(f"Loading YOLO model: {model_name}")
        _yolo_model = YOLO(model_name)
        _yolo_model_name = model_name  # remember which variant is cached
        logger.info("YOLO model loaded successfully")
    elif loaded_name != model_name:
        # Cache hit for a *different* variant: make the mismatch visible
        # instead of silently returning the wrong model.
        logger.warning(
            f"YOLO model {loaded_name} already loaded; "
            f"ignoring request for {model_name}"
        )

    return _yolo_model


def detect_objects(
    image_path: str,
    conf_threshold: float = 0.5,
    model_name: str = "yolov8x.pt",
) -> List[Dict]:
    """
    Run YOLO v8 object detection on one image file.

    Args:
        image_path: Path to image file
        conf_threshold: Minimum confidence threshold (0.0-1.0)
        model_name: YOLO model variant to use

    Returns:
        One dict per detected object, e.g.::

            {
                "class": "person",
                "class_id": 0,
                "confidence": 0.95,
                "bbox": [x1, y1, x2, y2],  # pixel coordinates
            }

    Raises:
        FileNotFoundError: If image file doesn't exist
        Exception: If detection fails
    """
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    logger.info(f"Detecting objects in: {image_path}")

    # Module-level cache avoids reloading the (large) model per call.
    model = get_yolo_model(model_name)
    results = model(image_path, conf=conf_threshold, verbose=False)

    detections = []
    for result in results:
        boxes = result.boxes
        # Walk class ids, scores, and corner coordinates in lockstep.
        for cls_tensor, conf_tensor, xyxy in zip(boxes.cls, boxes.conf, boxes.xyxy):
            class_id = int(cls_tensor)
            corners = xyxy.cpu().numpy()  # [x1, y1, x2, y2]
            detections.append(
                {
                    "class": result.names[class_id],
                    "class_id": class_id,
                    "confidence": round(float(conf_tensor), 3),
                    "bbox": [round(float(coord), 1) for coord in corners],
                }
            )

    logger.info(f"Detected {len(detections)} objects")
    return detections
def detect_objects_batch(
    image_paths: List[str],
    conf_threshold: float = 0.5,
    model_name: str = "yolov8x.pt",
) -> Dict[str, List[Dict]]:
    """
    Run YOLO v8 object detection over several images in one batch call.

    Args:
        image_paths: List of image file paths
        conf_threshold: Minimum confidence threshold
        model_name: YOLO model variant to use

    Returns:
        Mapping from each image path to its detection list, e.g.::

            {
                "frame_001.jpg": [{"class": "person", ...}, ...],
                "frame_002.jpg": [{"class": "car", ...}, ...],
            }

    Raises:
        Exception: If batch detection fails
    """
    logger.info(f"Batch detecting objects in {len(image_paths)} images")

    # Module-level cache avoids reloading the (large) model per call.
    model = get_yolo_model(model_name)

    # A single inference call over the whole list lets YOLO batch frames.
    results = model(image_paths, conf=conf_threshold, verbose=False)

    # YOLO yields one result per input, in input order.
    detections_map: Dict[str, List[Dict]] = {}
    for path, result in zip(image_paths, results):
        boxes = result.boxes
        per_image = []
        for cls_tensor, conf_tensor, xyxy in zip(boxes.cls, boxes.conf, boxes.xyxy):
            class_id = int(cls_tensor)
            per_image.append(
                {
                    "class": result.names[class_id],
                    "class_id": class_id,
                    "confidence": round(float(conf_tensor), 3),
                    "bbox": [round(float(coord), 1) for coord in xyxy.cpu().numpy()],
                }
            )
        detections_map[path] = per_image

    logger.info(f"Batch detection complete: {sum(len(d) for d in detections_map.values())} total objects")
    return detections_map
def summarize_objects(detections: List[Dict]) -> Dict:
    """
    Summarize object detections with counts and confidence scores.

    Args:
        detections: List of detections from detect_objects()

    Returns:
        Summary dictionary::

            {
                "total_objects": 15,
                "classes": {
                    "person": {"count": 5, "avg_confidence": 0.92},
                    "car": {"count": 2, "avg_confidence": 0.85},
                    ...
                },
                "most_common": ["person", "car", "bottle"]
            }

        ``most_common`` is capped at the 10 most frequent classes; ties
        keep first-seen order (stable sort over insertion order).
    """
    if not detections:
        return {
            "total_objects": 0,
            "classes": {},
            "most_common": [],
        }

    # Group raw confidence scores by class name (insertion order is
    # first-seen order, which fixes the tie order of most_common below).
    confidences_by_class: Dict[str, List[float]] = {}
    for det in detections:
        confidences_by_class.setdefault(det["class"], []).append(det["confidence"])

    # Build the final per-class stats directly, instead of accumulating
    # raw lists inside the result and deleting them afterwards.
    class_stats = {
        class_name: {
            "count": len(confs),
            "avg_confidence": round(sum(confs) / len(confs), 3),
        }
        for class_name, confs in confidences_by_class.items()
    }

    # Sort class names by frequency; sorted() is stable, so equal counts
    # stay in first-seen order.
    most_common = sorted(
        class_stats,
        key=lambda c: class_stats[c]["count"],
        reverse=True,
    )

    return {
        "total_objects": len(detections),
        "classes": class_stats,
        "most_common": most_common[:10],  # Top 10 most common
    }