Initial video AI analysis project

2026-06-17 11:33:54 +08:00
commit ef0047af6d
35 changed files with 8613 additions and 0 deletions
--- a/video_ai_analysis_poc/cli.py
+++ b/video_ai_analysis_poc/cli.py
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Sequence
+
+from .aggregator import aggregate_outputs
+from .clips import build_clip_records
+from .config import DEFAULT_CONFIG_PATH, load_config
+from .discovery import discover_videos
+from .ffmpeg_sampler import sample_video_frames
+from .hik_cloud import download_hik_cloud_recordings
+from .manifest import read_jsonl, write_manifest
+from .paths import stable_video_id
+from .probe import probe_video
+from .result_parser import build_clip_result
+from .timeline import DEFAULT_TIMEZONE, format_beijing_time, timeline_start_epoch
+from .vlm_client import infer_clip
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        description="Local video batch analysis PoC entrypoint."
+    )
+    parser.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
+    parser.add_argument("--input-dir")
+    parser.add_argument("--output-dir")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--until", choices=["clips", "inference"])
+    parser.add_argument("--limit-clips", type=int)
+    args = parser.parse_args(argv)
+
+    config = load_config(
+        args.config,
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+    )
+    if args.dry_run and args.until:
+        parser.error("--dry-run cannot be combined with --until")
+    if args.limit_clips is not None and args.limit_clips < 0:
+        parser.error("--limit-clips must be non-negative")
+
+    output_dir = Path(config["output"]["dir"])
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    video_manifest_path = output_dir / "video_manifest.jsonl"
+    resume_enabled = bool(config.get("output", {}).get("resume", False))
+    records = _load_resume_records(
+        video_manifest_path,
+        resume=resume_enabled,
+    )
+    record_indexes = {
+        _record_key(record): index
+        for index, record in enumerate(records)
+        if _record_key(record) is not None
+    }
+
+    try:
+        _acquire_source_records(
+            config,
+            output_dir,
+            records,
+            record_indexes,
+            download_source=not args.dry_run,
+        )
+    except ValueError as exc:
+        parser.error(str(exc))
+
+    write_manifest(video_manifest_path, records)
+    if args.dry_run:
+        return 0
+
+    clip_manifest_path = output_dir / "clip_manifest.jsonl"
+    existing_clip_records = read_jsonl(clip_manifest_path) if resume_enabled else []
+    existing_clip_video_ids = {
+        str(record.get("video_id"))
+        for record in existing_clip_records
+        if record.get("video_id")
+    }
+
+    frame_manifest_path = output_dir / "frame_manifest.jsonl"
+    frame_records = read_jsonl(frame_manifest_path) if resume_enabled else []
+    timezone_name = str(config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE))
+    backfilled_frame_video_ids = _backfill_frame_beijing_times(
+        frame_records,
+        records,
+        timezone_name=timezone_name,
+    )
+    existing_sampled_video_ids = {
+        str(record.get("video_id"))
+        for record in frame_records
+        if record.get("status") == "sampled" and record.get("video_id")
+    }
+    changed_frame_video_ids: set[str] = set(backfilled_frame_video_ids)
+    for record in records:
+        if record.get("status") != "probed":
+            continue
+        video_id = str(record.get("video_id"))
+        if args.until == "inference" and video_id in existing_clip_video_ids:
+            continue
+        if video_id in existing_sampled_video_ids:
+            continue
+        frame_records = _without_video_records(frame_records, video_id)
+        ffmpeg_config = dict(config["ffmpeg"])
+        ffmpeg_config["timezone"] = timezone_name
+        frame_records.extend(
+            sample_video_frames(
+                record,
+                output_dir,
+                ffmpeg_config,
+                manifest_path=None,
+            )
+        )
+        changed_frame_video_ids.add(video_id)
+    write_manifest(frame_manifest_path, frame_records)
+
+    sampled_video_ids = {
+        str(record.get("video_id"))
+        for record in frame_records
+        if record.get("status") == "sampled" and record.get("video_id")
+    }
+    clip_rebuild_video_ids = changed_frame_video_ids | (
+        sampled_video_ids - existing_clip_video_ids
+    )
+    clip_records = [
+        record
+        for record in existing_clip_records
+        if str(record.get("video_id")) not in clip_rebuild_video_ids
+    ]
+    frames_to_build = [
+        record
+        for record in frame_records
+        if str(record.get("video_id")) in clip_rebuild_video_ids
+    ]
+    clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
+    write_manifest(output_dir / "clip_manifest.jsonl", clip_records)
+    if args.until == "clips":
+        return 0
+
+    _run_inference(
+        clip_records,
+        records,
+        output_dir,
+        config,
+        limit_clips=args.limit_clips,
+        resume=resume_enabled,
+    )
+    if args.until == "inference":
+        return 0
+    aggregate_outputs(output_dir, config)
+    return 0
+
+
+def _load_resume_records(path: Path, *, resume: bool) -> list[dict[str, object]]:
+    if not resume:
+        return []
+    return read_jsonl(path)
+
+
+def _record_key(record: dict[str, object]) -> str | None:
+    video_id = record.get("video_id")
+    if video_id:
+        return str(video_id)
+    path = record.get("path")
+    if path:
+        return stable_video_id(str(path))
+    return None
+
+
+def _acquire_source_records(
+    config: dict[str, object],
+    output_dir: Path,
+    records: list[dict[str, object]],
+    record_indexes: dict[str, int],
+    *,
+    download_source: bool = True,
+) -> None:
+    for source_record in _source_video_records(
+        config,
+        output_dir,
+        download_source=download_source,
+    ):
+        path = source_record.get("path")
+        if not path:
+            continue
+        video_id = stable_video_id(str(path))
+        existing_index = record_indexes.get(video_id)
+        if (
+            existing_index is not None
+            and records[existing_index].get("status") == "probed"
+        ):
+            continue
+
+        probe_record = probe_video(
+            str(path),
+            timeout_seconds=config["ffprobe"]["timeout_seconds"],
+        )
+        record = {**source_record, **probe_record, "video_id": video_id}
+        if existing_index is None:
+            record_indexes[video_id] = len(records)
+            records.append(record)
+        else:
+            records[existing_index] = record
+
+
+def _source_video_records(
+    config: dict[str, object],
+    output_dir: Path,
+    *,
+    download_source: bool = True,
+) -> list[dict[str, object]]:
+    source_config = config.get("source", {})
+    source_mode = "local"
+    if isinstance(source_config, dict):
+        source_mode = str(source_config.get("mode", "local"))
+
+    if source_mode == "local":
+        videos = discover_videos(
+            config["input"]["dir"],
+            config["input"]["extensions"],
+            recursive=config["input"]["recursive"],
+        )
+        return [{"path": path} for path in videos]
+
+    if source_mode == "hik_cloud":
+        return [
+            record
+            for record in download_hik_cloud_recordings(
+                config,
+                output_dir,
+                download=download_source,
+            )
+            if record.get("status") == "downloaded"
+        ]
+
+    raise ValueError(f"unsupported source.mode: {source_mode}")
+
+
+def _without_video_records(
+    records: list[dict[str, object]],
+    video_id: str,
+) -> list[dict[str, object]]:
+    return [record for record in records if str(record.get("video_id")) != video_id]
+
+
+def _backfill_frame_beijing_times(
+    frame_records: list[dict[str, object]],
+    video_records: list[dict[str, object]],
+    *,
+    timezone_name: str,
+) -> set[str]:
+    video_by_id = {
+        str(record.get("video_id")): record
+        for record in video_records
+        if record.get("video_id")
+    }
+    changed_video_ids: set[str] = set()
+    for frame_record in frame_records:
+        if frame_record.get("status") != "sampled" or frame_record.get("beijing_time"):
+            continue
+        video_id = str(frame_record.get("video_id") or "")
+        start_epoch = timeline_start_epoch(video_by_id.get(video_id, {}))
+        beijing_time = format_beijing_time(
+            start_epoch,
+            offset_seconds=float(frame_record.get("offset_seconds") or 0),
+            timezone_name=timezone_name,
+        )
+        if beijing_time is None:
+            continue
+        frame_record["beijing_time"] = beijing_time
+        changed_video_ids.add(video_id)
+    return changed_video_ids
+
+
+def _run_inference(
+    clip_records: list[dict[str, object]],
+    video_records: list[dict[str, object]],
+    output_dir: Path,
+    config: dict[str, object],
+    *,
+    limit_clips: int | None,
+    resume: bool,
+) -> None:
+    results_path = output_dir / "clip_results.jsonl"
+    result_records = read_jsonl(results_path) if resume else []
+    clip_by_id = {
+        str(record.get("clip_id")): record
+        for record in clip_records
+        if record.get("clip_id")
+    }
+    result_records = [
+        _refresh_result_timeline(record, clip_by_id, config)
+        for record in result_records
+    ]
+    ok_clip_ids = {
+        str(record.get("clip_id"))
+        for record in result_records
+        if record.get("status") == "ok" and record.get("clip_id")
+    }
+    video_by_id = {
+        str(record.get("video_id")): record
+        for record in video_records
+        if record.get("video_id")
+    }
+    processed = 0
+    for clip_record in clip_records:
+        clip_id = str(clip_record.get("clip_id"))
+        if clip_id in ok_clip_ids:
+            continue
+        if limit_clips is not None and processed >= limit_clips:
+            break
+
+        result_records = [
+            record for record in result_records if str(record.get("clip_id")) != clip_id
+        ]
+        video_record = video_by_id.get(str(clip_record.get("video_id")), {})
+        result = _infer_and_parse_clip(clip_record, video_record, output_dir, config)
+        result_records.append(result)
+        _write_jsonl_exact(results_path, result_records)
+        processed += 1
+
+    _write_jsonl_exact(results_path, result_records)
+
+
+def _refresh_result_timeline(
+    result_record: dict[str, object],
+    clip_by_id: dict[str, dict[str, object]],
+    config: dict[str, object],
+) -> dict[str, object]:
+    clip_record = clip_by_id.get(str(result_record.get("clip_id")))
+    if not clip_record:
+        return result_record
+    if not _clip_has_beijing_timing(clip_record):
+        return result_record
+    timeline = dict(result_record.get("monitoring_timeline") or {})
+    timeline.update(
+        {
+            "timezone": config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE),
+            "clip_start_seconds": clip_record.get("clip_start_seconds"),
+            "clip_end_seconds": clip_record.get("clip_end_seconds"),
+            "clip_start_timecode": clip_record.get("clip_start_timecode"),
+            "clip_end_timecode": clip_record.get("clip_end_timecode"),
+            "clip_start_beijing_time": clip_record.get("clip_start_beijing_time"),
+            "clip_end_beijing_time": clip_record.get("clip_end_beijing_time"),
+            "frame_times": clip_record.get("frame_times", []),
+        }
+    )
+    refreshed = dict(result_record)
+    refreshed["monitoring_timeline"] = timeline
+    return refreshed
+
+
+def _clip_has_beijing_timing(clip_record: dict[str, object]) -> bool:
+    if clip_record.get("clip_start_beijing_time") or clip_record.get("clip_end_beijing_time"):
+        return True
+    for frame in clip_record.get("frame_times", []) or []:
+        if isinstance(frame, dict) and frame.get("beijing_time"):
+            return True
+    return False
+
+
+def _infer_and_parse_clip(
+    clip_record: dict[str, object],
+    video_record: dict[str, object],
+    output_dir: Path,
+    config: dict[str, object],
+) -> dict[str, object]:
+    schema_config = config.get("schema", {})
+    parse_retry = 0
+    if isinstance(schema_config, dict):
+        parse_retry = int(schema_config.get("parse_retry", 0))
+
+    attempts = parse_retry + 1
+    result: dict[str, object] | None = None
+    for attempt in range(attempts):
+        try:
+            inference = infer_clip(
+                clip_record,
+                output_dir,
+                config["vlm"],
+                config["prompt"],
+            )
+        except Exception as exc:
+            return build_clip_result(
+                "",
+                clip_record,
+                video_record,
+                config,
+                processing={},
+                status="inference_failed",
+                error=str(exc),
+            )
+
+        result = build_clip_result(
+            str(inference.get("raw_response", "")),
+            clip_record,
+            video_record,
+            config,
+            processing={
+                "latency_ms": inference.get("latency_ms"),
+                "http_status": inference.get("http_status"),
+                "attempt": attempt + 1,
+            },
+        )
+        if result.get("status") != "parse_failed":
+            return result
+    if result is None:
+        raise RuntimeError("unreachable inference state")
+    return result
+
+
+def _write_jsonl_exact(
+    path: Path,
+    records: list[dict[str, object]],
+) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        for record in records:
+            handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())