# video-ai-analysis/video_ai_analysis_poc/cli.py

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Sequence

from .aggregator import aggregate_outputs
from .clips import build_clip_records
from .config import DEFAULT_CONFIG_PATH, load_config
from .discovery import discover_videos
from .ffmpeg_sampler import sample_video_frames
from .hik_cloud import download_hik_cloud_recordings
from .manifest import read_jsonl, write_manifest
from .paths import stable_video_id
from .probe import probe_video
from .result_parser import build_clip_result
from .timeline import DEFAULT_TIMEZONE, format_beijing_time, timeline_start_epoch
from .vlm_client import infer_clip


def main(argv: Sequence[str] | None = None) -> int:
    """Run the local video batch analysis pipeline from the command line.

    The stages run in order: acquire and probe source videos, sample frames,
    build clips, run clip inference, then aggregate. ``--dry-run`` stops after
    the video manifest is written; ``--until clips`` and ``--until inference``
    stop after the named stage. When ``output.resume`` is enabled in the
    config, existing manifests in the output directory are reused.

    Args:
        argv: Command-line arguments to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        ``0`` on completion. Invalid flag combinations and unsupported source
        modes are reported through ``parser.error``, which exits the process.
    """
    parser = argparse.ArgumentParser(
        description="Local video batch analysis PoC entrypoint."
    )
    parser.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
    parser.add_argument("--input-dir")
    parser.add_argument("--output-dir")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--until", choices=["clips", "inference"])
    parser.add_argument("--limit-clips", type=int)
    args = parser.parse_args(argv)

    # Reject invalid flag combinations before loading the config so usage
    # errors are reported first and never masked by a config problem.
    if args.dry_run and args.until:
        parser.error("--dry-run cannot be combined with --until")
    if args.limit_clips is not None and args.limit_clips < 0:
        parser.error("--limit-clips must be non-negative")

    config = load_config(
        args.config,
        input_dir=args.input_dir,
        output_dir=args.output_dir,
    )

    output_dir = Path(config["output"]["dir"])
    output_dir.mkdir(parents=True, exist_ok=True)

    video_manifest_path = output_dir / "video_manifest.jsonl"
    resume_enabled = bool(config.get("output", {}).get("resume", False))
    records = _load_resume_records(
        video_manifest_path,
        resume=resume_enabled,
    )
    # Map each resumable record's key to its position in ``records``.
    record_indexes: dict[str, int] = {}
    for index, record in enumerate(records):
        key = _record_key(record)
        if key is not None:
            record_indexes[key] = index

    try:
        _acquire_source_records(
            config,
            output_dir,
            records,
            record_indexes,
            download_source=not args.dry_run,
        )
    except ValueError as exc:
        parser.error(str(exc))

    write_manifest(video_manifest_path, records)
    if args.dry_run:
        return 0

    clip_manifest_path = output_dir / "clip_manifest.jsonl"
    existing_clip_records = read_jsonl(clip_manifest_path) if resume_enabled else []
    existing_clip_video_ids = {
        str(record.get("video_id"))
        for record in existing_clip_records
        if record.get("video_id")
    }

    frame_manifest_path = output_dir / "frame_manifest.jsonl"
    frame_records = read_jsonl(frame_manifest_path) if resume_enabled else []
    timezone_name = str(config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE))
    backfilled_frame_video_ids = _backfill_frame_beijing_times(
        frame_records,
        records,
        timezone_name=timezone_name,
    )
    existing_sampled_video_ids = {
        str(record.get("video_id"))
        for record in frame_records
        if record.get("status") == "sampled" and record.get("video_id")
    }
    # Videos whose frame records changed in this run need their clips rebuilt.
    changed_frame_video_ids: set[str] = set(backfilled_frame_video_ids)
    for record in records:
        if record.get("status") != "probed":
            continue
        video_id = str(record.get("video_id"))
        # With --until inference, videos that already have clips are not
        # re-sampled.
        if args.until == "inference" and video_id in existing_clip_video_ids:
            continue
        if video_id in existing_sampled_video_ids:
            continue
        # Drop stale (non-sampled) frame records before sampling again.
        frame_records = _without_video_records(frame_records, video_id)
        ffmpeg_config = dict(config["ffmpeg"])
        ffmpeg_config["timezone"] = timezone_name
        frame_records.extend(
            sample_video_frames(
                record,
                output_dir,
                ffmpeg_config,
                manifest_path=None,
            )
        )
        changed_frame_video_ids.add(video_id)
    write_manifest(frame_manifest_path, frame_records)

    sampled_video_ids = {
        str(record.get("video_id"))
        for record in frame_records
        if record.get("status") == "sampled" and record.get("video_id")
    }
    # Rebuild clips for changed videos and for sampled videos without clips.
    clip_rebuild_video_ids = changed_frame_video_ids | (
        sampled_video_ids - existing_clip_video_ids
    )
    clip_records = [
        record
        for record in existing_clip_records
        if str(record.get("video_id")) not in clip_rebuild_video_ids
    ]
    frames_to_build = [
        record
        for record in frame_records
        if str(record.get("video_id")) in clip_rebuild_video_ids
    ]
    clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
    write_manifest(clip_manifest_path, clip_records)
    if args.until == "clips":
        return 0

    _run_inference(
        clip_records,
        records,
        output_dir,
        config,
        limit_clips=args.limit_clips,
        resume=resume_enabled,
    )
    if args.until == "inference":
        return 0
    aggregate_outputs(output_dir, config)
    return 0


def _load_resume_records(path: Path, *, resume: bool) -> list[dict[str, object]]:
    """Return the records stored at *path* when resuming, else an empty list."""
    return read_jsonl(path) if resume else []


def _record_key(record: dict[str, object]) -> str | None:
    video_id = record.get("video_id")
    if video_id:
        return str(video_id)
    path = record.get("path")
    if path:
        return stable_video_id(str(path))
    return None


def _acquire_source_records(
    config: dict[str, object],
    output_dir: Path,
    records: list[dict[str, object]],
    record_indexes: dict[str, int],
    *,
    download_source: bool = True,
) -> None:
    """Probe source videos and merge the results into *records* in place.

    Source records without a ``path`` are ignored, and videos whose existing
    record already has status ``"probed"`` are left untouched. Every other
    video is probed; the merged record replaces the existing entry or is
    appended, and *record_indexes* is updated for appended entries.
    """
    source_records = _source_video_records(
        config,
        output_dir,
        download_source=download_source,
    )
    for source_record in source_records:
        source_path = source_record.get("path")
        if not source_path:
            continue

        video_id = stable_video_id(str(source_path))
        slot = record_indexes.get(video_id)
        already_probed = (
            slot is not None and records[slot].get("status") == "probed"
        )
        if already_probed:
            continue

        probed = probe_video(
            str(source_path),
            timeout_seconds=config["ffprobe"]["timeout_seconds"],
        )
        # Probe fields win over source fields; the stable id is always set.
        merged = {**source_record, **probed, "video_id": video_id}
        if slot is not None:
            records[slot] = merged
            continue
        record_indexes[video_id] = len(records)
        records.append(merged)


def _source_video_records(
    config: dict[str, object],
    output_dir: Path,
    *,
    download_source: bool = True,
) -> list[dict[str, object]]:
    """Return source video records for the configured ``source.mode``.

    ``local`` (the default, also used when ``source`` is not a dict) lists the
    videos found under the configured input directory as ``{"path": ...}``
    records. ``hik_cloud`` returns the recordings reported with status
    ``"downloaded"``; *download_source* is forwarded as the ``download`` flag.

    Raises:
        ValueError: If ``source.mode`` is neither ``local`` nor ``hik_cloud``.
    """
    source_config = config.get("source", {})
    source_mode = (
        str(source_config.get("mode", "local"))
        if isinstance(source_config, dict)
        else "local"
    )

    if source_mode == "hik_cloud":
        recordings = download_hik_cloud_recordings(
            config,
            output_dir,
            download=download_source,
        )
        return [
            recording
            for recording in recordings
            if recording.get("status") == "downloaded"
        ]

    if source_mode != "local":
        raise ValueError(f"unsupported source.mode: {source_mode}")

    video_paths = discover_videos(
        config["input"]["dir"],
        config["input"]["extensions"],
        recursive=config["input"]["recursive"],
    )
    return [{"path": video_path} for video_path in video_paths]


def _without_video_records(
    records: list[dict[str, object]],
    video_id: str,
) -> list[dict[str, object]]:
    """Return a new list without the records whose ``video_id`` is *video_id*.

    Record ids are compared after ``str()`` conversion; the input list is not
    modified.
    """

    def belongs_to_other_video(entry: dict[str, object]) -> bool:
        return str(entry.get("video_id")) != video_id

    return list(filter(belongs_to_other_video, records))


def _backfill_frame_beijing_times(
    frame_records: list[dict[str, object]],
    video_records: list[dict[str, object]],
    *,
    timezone_name: str,
) -> set[str]:
    """Fill in a missing ``beijing_time`` on sampled frame records, in place.

    Only frames with status ``"sampled"`` and no ``beijing_time`` are touched.
    The value comes from the owning video's timeline start plus the frame's
    ``offset_seconds``; frames for which no time can be formatted are left
    unchanged.

    Returns:
        The video ids (``""`` when a frame has none) of the updated frames.
    """
    video_by_id: dict[str, dict[str, object]] = {}
    for video_record in video_records:
        if video_record.get("video_id"):
            video_by_id[str(video_record.get("video_id"))] = video_record

    changed_video_ids: set[str] = set()
    for frame_record in frame_records:
        needs_backfill = (
            frame_record.get("status") == "sampled"
            and not frame_record.get("beijing_time")
        )
        if not needs_backfill:
            continue

        video_id = str(frame_record.get("video_id") or "")
        beijing_time = format_beijing_time(
            timeline_start_epoch(video_by_id.get(video_id, {})),
            offset_seconds=float(frame_record.get("offset_seconds") or 0),
            timezone_name=timezone_name,
        )
        if beijing_time is not None:
            frame_record["beijing_time"] = beijing_time
            changed_video_ids.add(video_id)
    return changed_video_ids


def _run_inference(
    clip_records: list[dict[str, object]],
    video_records: list[dict[str, object]],
    output_dir: Path,
    config: dict[str, object],
    *,
    limit_clips: int | None,
    resume: bool,
) -> None:
    """Run inference for clips lacking an ``ok`` result and persist the results.

    When *resume* is true, previous results are loaded from
    ``clip_results.jsonl`` and their timelines refreshed from the current clip
    records. Clips whose stored result has status ``"ok"`` are skipped. At most
    *limit_clips* clips are processed (no limit when ``None``). The results
    file is rewritten after every processed clip and once more at the end.
    """
    results_path = output_dir / "clip_results.jsonl"
    result_records = read_jsonl(results_path) if resume else []

    clip_by_id: dict[str, dict[str, object]] = {}
    for clip in clip_records:
        if clip.get("clip_id"):
            clip_by_id[str(clip.get("clip_id"))] = clip

    result_records = [
        _refresh_result_timeline(previous, clip_by_id, config)
        for previous in result_records
    ]
    ok_clip_ids = {
        str(previous.get("clip_id"))
        for previous in result_records
        if previous.get("status") == "ok" and previous.get("clip_id")
    }

    video_by_id: dict[str, dict[str, object]] = {}
    for video in video_records:
        if video.get("video_id"):
            video_by_id[str(video.get("video_id"))] = video

    # Lazily walk the clips that still need a successful result.
    pending_clips = (
        clip
        for clip in clip_records
        if str(clip.get("clip_id")) not in ok_clip_ids
    )
    for processed, clip_record in enumerate(pending_clips):
        if limit_clips is not None and processed >= limit_clips:
            break

        clip_id = str(clip_record.get("clip_id"))
        # Replace any earlier (non-ok) result stored for this clip.
        result_records = [
            previous
            for previous in result_records
            if str(previous.get("clip_id")) != clip_id
        ]
        video_record = video_by_id.get(str(clip_record.get("video_id")), {})
        result_records.append(
            _infer_and_parse_clip(clip_record, video_record, output_dir, config)
        )
        # Persist after each clip so progress is kept if a later clip fails.
        _write_jsonl_exact(results_path, result_records)

    _write_jsonl_exact(results_path, result_records)


def _refresh_result_timeline(
    result_record: dict[str, object],
    clip_by_id: dict[str, dict[str, object]],
    config: dict[str, object],
) -> dict[str, object]:
    """Return *result_record* with its timeline synced to the current clip.

    The record is returned unchanged (same object) when its clip is unknown,
    empty, or carries no Beijing-time information. Otherwise a copy is returned
    whose ``monitoring_timeline`` has the timing fields overwritten from the
    clip record; other timeline keys are kept.
    """
    clip_record = clip_by_id.get(str(result_record.get("clip_id")))
    if not (clip_record and _clip_has_beijing_timing(clip_record)):
        return result_record

    timeline = dict(result_record.get("monitoring_timeline") or {})
    timeline["timezone"] = config.get("runtime", {}).get(
        "timezone", DEFAULT_TIMEZONE
    )
    for field in (
        "clip_start_seconds",
        "clip_end_seconds",
        "clip_start_timecode",
        "clip_end_timecode",
        "clip_start_beijing_time",
        "clip_end_beijing_time",
    ):
        timeline[field] = clip_record.get(field)
    timeline["frame_times"] = clip_record.get("frame_times", [])

    return dict(result_record, monitoring_timeline=timeline)


def _clip_has_beijing_timing(clip_record: dict[str, object]) -> bool:
    """Report whether a clip carries any Beijing-time information.

    True when either clip boundary time is set, or when any dict entry in
    ``frame_times`` has a non-empty ``beijing_time``.
    """
    for boundary in ("clip_start_beijing_time", "clip_end_beijing_time"):
        if clip_record.get(boundary):
            return True
    frames = clip_record.get("frame_times", []) or []
    return any(
        isinstance(frame, dict) and frame.get("beijing_time") for frame in frames
    )


def _infer_and_parse_clip(
    clip_record: dict[str, object],
    video_record: dict[str, object],
    output_dir: Path,
    config: dict[str, object],
) -> dict[str, object]:
    """Run inference for one clip and build its result record.

    The clip is inferred up to ``schema.parse_retry + 1`` times; a retry only
    happens when the built result has status ``"parse_failed"``. A missing,
    null or negative ``parse_retry`` is treated as ``0`` so that at least one
    attempt is always made.

    Args:
        clip_record: Clip manifest record to analyse.
        video_record: Record of the clip's source video (may be empty).
        output_dir: Pipeline output directory, passed on to ``infer_clip``.
        config: Full configuration; ``vlm``, ``prompt`` and optionally
            ``schema`` are read here.

    Returns:
        The first result that is not ``"parse_failed"``, the last
        ``"parse_failed"`` result once retries are exhausted, or an
        ``"inference_failed"`` result when ``infer_clip`` raises.
    """
    schema_config = config.get("schema", {})
    parse_retry = 0
    if isinstance(schema_config, dict):
        # ``or 0`` tolerates an explicit null value in the config.
        parse_retry = int(schema_config.get("parse_retry") or 0)

    # Clamp so a negative retry count cannot skip the first attempt.
    attempts = max(parse_retry, 0) + 1
    result: dict[str, object] | None = None
    for attempt in range(attempts):
        try:
            inference = infer_clip(
                clip_record,
                output_dir,
                config["vlm"],
                config["prompt"],
            )
        except Exception as exc:
            # Boundary handler: any client failure becomes a recorded result
            # instead of aborting the whole batch. Inference errors are not
            # retried.
            return build_clip_result(
                "",
                clip_record,
                video_record,
                config,
                processing={},
                status="inference_failed",
                error=str(exc),
            )

        result = build_clip_result(
            str(inference.get("raw_response", "")),
            clip_record,
            video_record,
            config,
            processing={
                "latency_ms": inference.get("latency_ms"),
                "http_status": inference.get("http_status"),
                "attempt": attempt + 1,
            },
        )
        if result.get("status") != "parse_failed":
            return result
    if result is None:
        # Defensive guard; ``attempts`` is always at least 1.
        raise RuntimeError("unreachable inference state")
    return result


def _write_jsonl_exact(
    path: Path,
    records: list[dict[str, object]],
) -> None:
    """Replace *path* with exactly *records*, one JSON object per line.

    The data is written to a sibling ``<name>.tmp`` file and then moved over
    *path*, so an interruption or a serialisation error part-way through
    leaves the previous file intact instead of truncated. Parent directories
    are created as needed.

    Args:
        path: Destination JSONL file.
        records: Records to serialise (keys sorted, non-ASCII kept as-is).

    Raises:
        TypeError: If a record contains a value ``json.dumps`` cannot encode.
        OSError: If the file cannot be written or replaced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    temp_path = path.with_name(f"{path.name}.tmp")
    try:
        with temp_path.open("w", encoding="utf-8") as handle:
            handle.writelines(
                json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n"
                for record in records
            )
        temp_path.replace(path)
    except BaseException:
        # Do not leave a partial temp file behind; the old file is untouched.
        temp_path.unlink(missing_ok=True)
        raise


# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())