# Source code for cw.lib.video_analysis.categorization

"""
Scene categorization for video content.

Categorizes scenes based on visual content, objects, and context.
"""

import logging
import re
from collections import Counter
from typing import Dict, List, Set

logger = logging.getLogger(__name__)


# Lookup table that drives scene categorization.
#
# Each category name maps to a pattern with three fields:
#   required_objects -- detection class labels associated with the category
#                       (an empty set means no object can trigger it)
#   keywords         -- lowercase terms looked for in the scene's transcription
#   description      -- short human-readable summary of the category
CATEGORY_PATTERNS = {
    "people": {
        "required_objects": {"person"},
        "keywords": [
            "people",
            "person",
            "human",
            "group",
            "crowd",
            "family",
        ],
        "description": "Scenes featuring people",
    },
    "product": {
        "required_objects": {
            "bottle",
            "cup",
            "bowl",
            "knife",
            "fork",
            "spoon",
        },
        "keywords": ["product", "package", "container", "food", "drink"],
        "description": "Product shots and demonstrations",
    },
    "food": {
        "required_objects": {
            "pizza",
            "cake",
            "sandwich",
            "hot dog",
            "donut",
            "carrot",
            "apple",
            "orange",
            "banana",
        },
        "keywords": ["food", "meal", "eat", "cook", "kitchen", "recipe"],
        "description": "Food and culinary scenes",
    },
    "lifestyle": {
        "required_objects": {
            "person",
            "couch",
            "bed",
            "dining table",
            "tv",
        },
        "keywords": ["home", "family", "relax", "comfortable", "living"],
        "description": "Lifestyle and home scenes",
    },
    "outdoor": {
        "required_objects": {
            "bicycle",
            "car",
            "motorcycle",
            "bus",
            "truck",
            "boat",
        },
        "keywords": ["outdoor", "outside", "nature", "park", "street", "road"],
        "description": "Outdoor and travel scenes",
    },
    "technology": {
        "required_objects": {
            "cell phone",
            "laptop",
            "keyboard",
            "mouse",
            "tv",
            "remote",
        },
        "keywords": [
            "technology",
            "tech",
            "digital",
            "device",
            "phone",
            "computer",
        ],
        "description": "Technology and digital devices",
    },
    "sports": {
        "required_objects": {
            "sports ball",
            "tennis racket",
            "baseball bat",
            "skateboard",
            "surfboard",
            "skis",
        },
        "keywords": ["sport", "game", "play", "active", "fitness", "exercise"],
        "description": "Sports and athletic activities",
    },
    "animals": {
        "required_objects": {
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
        },
        "keywords": ["animal", "pet", "dog", "cat", "bird", "wildlife"],
        "description": "Animals and pets",
    },
    "text": {
        "required_objects": {"book", "clock"},
        "keywords": ["text", "title", "message", "words", "logo", "brand"],
        "description": "Text overlays and branding",
    },
    "abstract": {
        # Deliberately empty: this category is never selected by an object.
        "required_objects": set(),
        "keywords": ["abstract", "graphic", "animation", "pattern", "color"],
        "description": "Abstract visuals and graphics",
    },
}


def _mentions_keyword(text: str, keywords: List[str]) -> bool:
    """Return True if any keyword occurs at the start of a word in *text*."""
    # Anchoring at a word boundary keeps stem matches ("sport" -> "sports",
    # "cook" -> "cooking") while rejecting hits buried inside unrelated words
    # ("eat" in "great", "cat" in "location", "play" in "display").
    return any(
        re.search(r"\b" + re.escape(keyword), text) for keyword in keywords
    )


def categorize_scene(
    scene: Dict,
    detections: List[Dict],
    transcription_text: str = "",
) -> List[str]:
    """
    Categorize a single scene based on detected objects and context.

    A category from ``CATEGORY_PATTERNS`` is selected when at least one of
    its ``required_objects`` was detected, or when one of its ``keywords``
    begins a word of the (lowercased) transcription text.

    Args:
        scene: Scene dictionary with metadata. Currently not consulted; it is
            accepted so callers can pass the scene alongside its detections.
        detections: List of object detections for the scene. Each detection
            must carry a "class" key. ``None`` is treated as no detections.
        transcription_text: Optional transcribed text from the scene

    Returns:
        Alphabetically sorted list of category names (can be multiple),
        e.g., ["lifestyle", "people", "product"]. Falls back to
        ["abstract"] when nothing matches.

    Raises:
        KeyError: If a detection has no "class" key.
    """
    # Get detected object classes
    detected_classes = {det["class"] for det in detections or ()}

    # Normalize transcription text (keywords in the table are lowercase)
    text_lower = transcription_text.lower() if transcription_text else ""

    categories = []
    for category_name, pattern in CATEGORY_PATTERNS.items():
        # An empty required_objects set is always disjoint, so such a
        # category can only be reached through its keywords.
        has_objects = not pattern["required_objects"].isdisjoint(detected_classes)

        if has_objects or _mentions_keyword(text_lower, pattern["keywords"]):
            categories.append(category_name)

    # Special case: if no categories matched, default to "abstract"
    if not categories:
        return ["abstract"]

    return sorted(categories)


def categorize_scenes(
    scenes: List[Dict],
    keyframe_detections: Dict[int, List[Dict]],
    transcription: Dict,
) -> List[Dict]:
    """
    Categorize all scenes in a video.

    Each scene is categorized from the detections registered under its
    scene number and from the transcription segments whose start time falls
    inside the scene's [start_time, end_time) range. Input scene
    dictionaries are not modified; shallow copies are returned.

    Args:
        scenes: List of scene dictionaries; each must provide
            "scene_number", "start_time" and "end_time"
        keyframe_detections: Dict mapping scene_number to detections;
            scenes without an entry are treated as having no detections
        transcription: Full transcription data. Its "segments" entry (a list
            of dicts with "start" and "text") is used; ``None`` or a missing
            "segments" entry is treated as no transcription.

    Returns:
        List of scene dictionaries with added "categories" field:
        [
            {
                "scene_number": 1,
                "start_time": 0.0,
                "end_time": 5.0,
                "categories": ["people", "lifestyle"],
                ...
            },
            ...
        ]
    """
    logger.info("Categorizing %d scenes", len(scenes))

    # The segment list is the same for every scene, so look it up once.
    segments = (transcription or {}).get("segments") or []

    categorized_scenes = []
    for scene in scenes:
        start_time = scene["start_time"]
        end_time = scene["end_time"]

        # Get detections for this scene's keyframe
        detections = keyframe_detections.get(scene["scene_number"], [])

        # A segment belongs to the scene in which it starts, so a segment
        # spanning a scene boundary is attributed to the earlier scene only.
        scene_text = " ".join(
            segment["text"]
            for segment in segments
            if start_time <= segment["start"] < end_time
        )

        # Copy so the caller's scene dictionaries stay untouched
        scene_with_categories = scene.copy()
        scene_with_categories["categories"] = categorize_scene(
            scene, detections, scene_text
        )
        categorized_scenes.append(scene_with_categories)

    logger.info("Scene categorization complete")
    return categorized_scenes


def summarize_categories(categorized_scenes: List[Dict]) -> Dict:
    """
    Summarize category distribution across all scenes.

    Args:
        categorized_scenes: List of scenes with categories. A scene whose
            "categories" entry is missing or ``None`` contributes nothing
            to the counts but is still included in "total_scenes".

    Returns:
        Category summary:
        {
            "total_scenes": 10,
            "category_counts": {
                "people": 6,
                "product": 4,
                "lifestyle": 3,
                ...
            },
            "primary_categories": ["people", "product"],  # Top 3
        }

        "primary_categories" holds at most three names, ordered by
        descending count; categories with equal counts keep the order in
        which they were first encountered.
    """
    category_counts = Counter(
        category
        for scene in categorized_scenes
        for category in scene.get("categories") or ()
    )

    # most_common orders by count and breaks ties by first occurrence
    primary_categories = [
        category for category, _ in category_counts.most_common(3)
    ]

    return {
        "total_scenes": len(categorized_scenes),
        # Hand back a plain dict so callers see the same type as before
        "category_counts": dict(category_counts),
        "primary_categories": primary_categories,
    }