"""
Audience insights generation using LLM.
Analyzes video content to generate audience targeting recommendations.
"""
import json
import logging
from typing import Dict
from cw.lib.prompts import render_prompt
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# ("[docs]" Sphinx source-link marker removed: documentation-extraction residue, not code.)
def generate_audience_insights(
    script: Dict,
    visual_style: Dict,
    sentiment: Dict,
    transcription: Dict,
    categories: Dict,
    model_id: str = "Qwen/Qwen2.5-3B-Instruct",
    load_in_4bit: bool = False,
) -> Dict:
    """
    Generate audience targeting insights using LLM analysis.

    A context dict is assembled from the analysis inputs, rendered into the
    "audience-insights" prompt template, and sent to a structured-output
    generator. The raw output is validated against ``AudienceInsights`` and
    returned as a plain dict. If anything in that path raises, a rule-based
    fallback built from the same context is returned instead.

    Args:
        script: Generated script data; its "scenes" entry is used.
        visual_style: Visual style analysis results; "dominant_colors",
            "avg_brightness" and "lighting_distribution" are used.
        sentiment: Sentiment analysis results; "overall_sentiment" and
            "overall_score" are used.
        transcription: Audio transcription data; "language" is used.
        categories: Scene categorization summary; "primary_categories" and
            "category_counts" are used.
        model_id: LLM model to use for generation.
        load_in_4bit: Whether to use 4-bit quantization.

        Any of the five analysis inputs may be ``None`` (or empty); it is then
        treated as an empty dict and the per-key defaults apply.

    Returns:
        On the LLM path, ``model_dump()`` of the validated ``AudienceInsights``
        instance. Otherwise the dict produced by
        ``_generate_fallback_insights``.

    Note:
        Failures on the LLM path (including a failed import of the model
        loader or schema) are not propagated: any ``Exception`` is logged as a
        warning with its traceback and the fallback result is returned.
    """
    # Normalise missing analysis results up front. This code runs outside the
    # try block, so without it a None input would raise AttributeError before
    # the fallback path could ever be reached.
    script = script or {}
    visual_style = visual_style or {}
    sentiment = sentiment or {}
    transcription = transcription or {}
    categories = categories or {}

    # Context shared by the prompt template and the rule-based fallback.
    context = {
        "script_scenes": script.get("scenes", []),
        "dominant_colors": visual_style.get("dominant_colors", []),
        "avg_brightness": visual_style.get("avg_brightness", 0),
        "lighting_distribution": visual_style.get("lighting_distribution", {}),
        "overall_sentiment": sentiment.get("overall_sentiment", "neutral"),
        "sentiment_score": sentiment.get("overall_score", 0),
        "transcription_language": transcription.get("language", "unknown"),
        "primary_categories": categories.get("primary_categories", []),
        "category_counts": categories.get("category_counts", {}),
    }

    try:
        # Imported inside the try so that an import failure is caught by the
        # handler below and routed to the fallback like any other error.
        from cw.lib.pipeline.model_loader import get_model_loader
        from cw.lib.video_analysis.schemas import AudienceInsights

        logger.info(
            "Generating audience insights with LLM",
            extra={
                "model_id": model_id,
                "load_in_4bit": load_in_4bit,
            },
        )

        # Get model loader and a generator constrained to the output schema
        loader = get_model_loader(model_id=model_id, load_in_4bit=load_in_4bit)
        generator = loader.get_generator(output_schema=AudienceInsights)

        # Render prompt template
        user_prompt = render_prompt("audience-insights", **context)

        # Apply chat template
        system_message = (
            "You are an expert marketing analyst specializing in audience "
            "segmentation and targeting. Produce ONLY valid JSON matching "
            "the requested schema — no commentary."
        )
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ]
        prompt = loader.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Generate insights with structured output
        logger.info("Invoking LLM for structured audience insights generation")
        raw_output = generator(prompt, max_new_tokens=4096)

        # The generator may hand back a JSON string or an already-parsed
        # object; only the string form needs decoding before validation.
        payload = json.loads(raw_output) if isinstance(raw_output, str) else raw_output
        result = AudienceInsights.model_validate(payload)

        # Convert to dict format
        insights = result.model_dump()

        logger.info(
            "Audience insights generation complete",
            extra={
                "primary_audience_age": insights["primary_audience"]["demographics"][
                    "age_range"
                ],
                "secondary_audiences_count": len(insights["secondary_audiences"]),
                "high_fit_markets_count": len(
                    insights["market_potential"]["high_fit_markets"]
                ),
            },
        )
        return insights
    except Exception as e:
        # Deliberate best-effort boundary: log with traceback, then degrade to
        # the rule-based result instead of failing the caller.
        logger.warning(
            "LLM-based audience insights failed: %s. Falling back to rule-based approach.",
            e,
            exc_info=True,
        )

    # Return fallback insights
    return _generate_fallback_insights(context)
def _generate_fallback_insights(context: Dict) -> Dict:
    """
    Generate basic fallback insights when LLM is unavailable.

    Uses rule-based heuristics from video analysis data.

    Args:
        context: Analysis context. The keys read are "primary_categories",
            "overall_sentiment", "script_scenes" and "transcription_language";
            each is optional.

    Returns:
        Dict with the keys "reasoning", "primary_audience",
        "secondary_audiences", "market_potential" and
        "messaging_recommendations".
    """
    logger.info("Generating fallback audience insights (rule-based)")

    # `or []` also covers a key that is present but set to None, which would
    # otherwise make every `in primary_categories` test raise TypeError.
    primary_categories = context.get("primary_categories") or []
    sentiment = context.get("overall_sentiment", "neutral")
    language = context.get("transcription_language", "en")

    # Simple demographic inference from categories. The checks run in order,
    # so a later match overrides the age range set by an earlier one.
    demographics = {"age_range": "all", "gender": "all", "income_level": "all"}
    if "people" in primary_categories and "family" in str(
        context.get("script_scenes", [])
    ).lower():
        demographics["age_range"] = "25-54"
        demographics["income_level"] = "middle to upper-middle"
    if "technology" in primary_categories:
        demographics["age_range"] = "18-45"
    if "sports" in primary_categories:
        demographics["age_range"] = "18-34"

    # Simple psychographic inference
    values = []
    interests = []
    if "people" in primary_categories:
        values.append("relationships")
        interests.append("social activities")
    if "food" in primary_categories:
        values.append("quality")
        interests.append("cooking")
    if "lifestyle" in primary_categories:
        values.extend(["comfort", "family"])
    if "technology" in primary_categories:
        values.append("innovation")
        interests.append("technology")

    # Market potential based on language. The table is built per call so the
    # returned list is never shared between results.
    markets_by_language = {
        "en": ["US", "UK", "CA", "AU"],
        "es": ["ES", "MX", "AR"],
        "de": ["DE", "AT", "CH"],
        "fr": ["FR", "CA", "BE"],
    }
    # Match on the lower-cased primary subtag so that "EN", "en-US" or "fr_CA"
    # resolve to their language instead of silently hitting the default.
    language_key = str(language).strip().lower().replace("_", "-").split("-")[0]
    high_fit_markets = markets_by_language.get(language_key, ["US"])  # Default

    # Messaging recommendations based on sentiment
    if sentiment == "positive":
        messaging = [
            "Emphasize happiness and satisfaction",
            "Use uplifting, energetic tone",
        ]
    elif sentiment == "negative":
        messaging = [
            "Address pain points and solutions",
            "Focus on problem-solving benefits",
        ]
    else:
        messaging = [
            "Present clear, factual information",
            "Balance emotional and rational appeals",
        ]

    return {
        "reasoning": "Generated using rule-based heuristics from video analysis (LLM unavailable)",
        "primary_audience": {
            "demographics": demographics,
            "psychographics": {
                # Generic placeholders when no category rule contributed anything
                "values": values if values else ["quality", "value"],
                "interests": interests if interests else ["general"],
                "lifestyle": "varied",
            },
        },
        "secondary_audiences": [],
        "market_potential": {
            "high_fit_markets": high_fit_markets,
            "adaptation_needed": [],
            "considerations": [
                # Reports the language value exactly as received, not the
                # normalised lookup key.
                f"Content language is {language}",
                f"Primary sentiment is {sentiment}",
                "Consider cultural adaptation for non-primary markets",
            ],
        },
        "messaging_recommendations": messaging,
    }