Files
video-ai-analysis/video_ai_analysis_poc/cli.py
2026-06-17 11:33:54 +08:00

425 lines
14 KiB
Python

from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Sequence
from .aggregator import aggregate_outputs
from .clips import build_clip_records
from .config import DEFAULT_CONFIG_PATH, load_config
from .discovery import discover_videos
from .ffmpeg_sampler import sample_video_frames
from .hik_cloud import download_hik_cloud_recordings
from .manifest import read_jsonl, write_manifest
from .paths import stable_video_id
from .probe import probe_video
from .result_parser import build_clip_result
from .timeline import DEFAULT_TIMEZONE, format_beijing_time, timeline_start_epoch
from .vlm_client import infer_clip
def main(argv: Sequence[str] | None = None) -> int:
    """Run the batch video-analysis pipeline from the command line.

    Stages run in order: acquire and probe source videos, sample frames,
    build clips, run inference on the clips, and aggregate the outputs. The
    video, frame and clip manifests are written to the output directory as
    each of the first three stages completes.

    Flags:
        --dry-run       Write the video manifest only (source acquisition is
                        asked not to download) and stop.
        --until clips   Stop after the clip manifest has been written.
        --until inference
                        Stop after inference, skipping aggregation.
        --limit-clips N Cap the number of clips inferred in this run.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success. Invalid flag combinations, and a ``ValueError``
        raised while acquiring sources, exit through ``parser.error``
        (``SystemExit``) instead of returning.
    """
    parser = argparse.ArgumentParser(
        description="Local video batch analysis PoC entrypoint."
    )
    parser.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
    parser.add_argument("--input-dir")
    parser.add_argument("--output-dir")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--until", choices=["clips", "inference"])
    parser.add_argument("--limit-clips", type=int)
    args = parser.parse_args(argv)

    # Validate flag combinations before reading the config so that a usage
    # error is reported immediately, without any other work being attempted.
    if args.dry_run and args.until:
        parser.error("--dry-run cannot be combined with --until")
    if args.limit_clips is not None and args.limit_clips < 0:
        parser.error("--limit-clips must be non-negative")

    config = load_config(
        args.config,
        input_dir=args.input_dir,
        output_dir=args.output_dir,
    )

    output_dir = Path(config["output"]["dir"])
    output_dir.mkdir(parents=True, exist_ok=True)
    video_manifest_path = output_dir / "video_manifest.jsonl"
    resume_enabled = bool(config.get("output", {}).get("resume", False))

    # --- Stage 1: acquire + probe source videos -------------------------
    records = _load_resume_records(
        video_manifest_path,
        resume=resume_enabled,
    )
    # Map each resumed record's key to its position so a re-probed video
    # replaces its old entry in place. Later duplicates win, as before.
    record_indexes: dict[str, int] = {}
    for index, record in enumerate(records):
        key = _record_key(record)
        if key is not None:
            record_indexes[key] = index
    try:
        _acquire_source_records(
            config,
            output_dir,
            records,
            record_indexes,
            download_source=not args.dry_run,
        )
    except ValueError as exc:
        # e.g. an unsupported ``source.mode``; surface it as a usage error.
        parser.error(str(exc))
    write_manifest(video_manifest_path, records)
    if args.dry_run:
        return 0

    # --- Stage 2: sample frames -----------------------------------------
    clip_manifest_path = output_dir / "clip_manifest.jsonl"
    existing_clip_records = read_jsonl(clip_manifest_path) if resume_enabled else []
    existing_clip_video_ids = {
        str(record.get("video_id"))
        for record in existing_clip_records
        if record.get("video_id")
    }
    frame_manifest_path = output_dir / "frame_manifest.jsonl"
    frame_records = read_jsonl(frame_manifest_path) if resume_enabled else []
    timezone_name = str(config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE))
    # Resumed frames that lack a Beijing timestamp get one filled in; the
    # affected videos must have their clips rebuilt further down.
    backfilled_frame_video_ids = _backfill_frame_beijing_times(
        frame_records,
        records,
        timezone_name=timezone_name,
    )
    existing_sampled_video_ids = {
        str(record.get("video_id"))
        for record in frame_records
        if record.get("status") == "sampled" and record.get("video_id")
    }
    changed_frame_video_ids: set[str] = set(backfilled_frame_video_ids)
    for record in records:
        if record.get("status") != "probed":
            continue
        video_id = str(record.get("video_id"))
        # When stopping at inference, a video that already has clips does
        # not need its frames sampled again.
        if args.until == "inference" and video_id in existing_clip_video_ids:
            continue
        if video_id in existing_sampled_video_ids:
            continue
        # Drop any stale (non-"sampled") frame rows for this video before
        # appending the fresh sampling result.
        frame_records = _without_video_records(frame_records, video_id)
        ffmpeg_config = dict(config["ffmpeg"])
        ffmpeg_config["timezone"] = timezone_name
        frame_records.extend(
            sample_video_frames(
                record,
                output_dir,
                ffmpeg_config,
                manifest_path=None,
            )
        )
        changed_frame_video_ids.add(video_id)
    write_manifest(frame_manifest_path, frame_records)

    # --- Stage 3: build clips -------------------------------------------
    sampled_video_ids = {
        str(record.get("video_id"))
        for record in frame_records
        if record.get("status") == "sampled" and record.get("video_id")
    }
    # Rebuild clips for videos whose frames changed in this run, plus any
    # sampled video that has no clips yet.
    clip_rebuild_video_ids = changed_frame_video_ids | (
        sampled_video_ids - existing_clip_video_ids
    )
    clip_records = [
        record
        for record in existing_clip_records
        if str(record.get("video_id")) not in clip_rebuild_video_ids
    ]
    frames_to_build = [
        record
        for record in frame_records
        if str(record.get("video_id")) in clip_rebuild_video_ids
    ]
    clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
    write_manifest(clip_manifest_path, clip_records)
    if args.until == "clips":
        return 0

    # --- Stage 4: inference ---------------------------------------------
    _run_inference(
        clip_records,
        records,
        output_dir,
        config,
        limit_clips=args.limit_clips,
        resume=resume_enabled,
    )
    if args.until == "inference":
        return 0

    # --- Stage 5: aggregation -------------------------------------------
    aggregate_outputs(output_dir, config)
    return 0
def _load_resume_records(path: Path, *, resume: bool) -> list[dict[str, object]]:
if not resume:
return []
return read_jsonl(path)
def _record_key(record: dict[str, object]) -> str | None:
video_id = record.get("video_id")
if video_id:
return str(video_id)
path = record.get("path")
if path:
return stable_video_id(str(path))
return None
def _acquire_source_records(
    config: dict[str, object],
    output_dir: Path,
    records: list[dict[str, object]],
    record_indexes: dict[str, int],
    *,
    download_source: bool = True,
) -> None:
    """Probe every source video and merge the results into *records*.

    *records* and *record_indexes* are updated in place. A source whose
    existing record already has status ``"probed"`` is left untouched; any
    other source is probed and its record is either replaced at its
    existing position or appended.

    Args:
        config: Pipeline configuration; ``config["ffprobe"]["timeout_seconds"]``
            is read for each probe.
        output_dir: Passed through to the source lookup.
        records: Video records, mutated in place.
        record_indexes: Maps video id to its index in *records*; mutated in
            place when a record is appended.
        download_source: Passed through to the source lookup.
    """
    sources = _source_video_records(
        config,
        output_dir,
        download_source=download_source,
    )
    for source in sources:
        source_path = source.get("path")
        if not source_path:
            # Nothing to probe without a path.
            continue

        video_id = stable_video_id(str(source_path))
        slot = record_indexes.get(video_id)
        already_probed = (
            slot is not None and records[slot].get("status") == "probed"
        )
        if already_probed:
            continue

        probed = probe_video(
            str(source_path),
            timeout_seconds=config["ffprobe"]["timeout_seconds"],
        )
        # Probe fields override source fields; video_id always wins.
        merged = {**source, **probed, "video_id": video_id}
        if slot is not None:
            records[slot] = merged
            continue
        record_indexes[video_id] = len(records)
        records.append(merged)
def _source_video_records(
config: dict[str, object],
output_dir: Path,
*,
download_source: bool = True,
) -> list[dict[str, object]]:
source_config = config.get("source", {})
source_mode = "local"
if isinstance(source_config, dict):
source_mode = str(source_config.get("mode", "local"))
if source_mode == "local":
videos = discover_videos(
config["input"]["dir"],
config["input"]["extensions"],
recursive=config["input"]["recursive"],
)
return [{"path": path} for path in videos]
if source_mode == "hik_cloud":
return [
record
for record in download_hik_cloud_recordings(
config,
output_dir,
download=download_source,
)
if record.get("status") == "downloaded"
]
raise ValueError(f"unsupported source.mode: {source_mode}")
def _without_video_records(
records: list[dict[str, object]],
video_id: str,
) -> list[dict[str, object]]:
return [record for record in records if str(record.get("video_id")) != video_id]
def _backfill_frame_beijing_times(
frame_records: list[dict[str, object]],
video_records: list[dict[str, object]],
*,
timezone_name: str,
) -> set[str]:
video_by_id = {
str(record.get("video_id")): record
for record in video_records
if record.get("video_id")
}
changed_video_ids: set[str] = set()
for frame_record in frame_records:
if frame_record.get("status") != "sampled" or frame_record.get("beijing_time"):
continue
video_id = str(frame_record.get("video_id") or "")
start_epoch = timeline_start_epoch(video_by_id.get(video_id, {}))
beijing_time = format_beijing_time(
start_epoch,
offset_seconds=float(frame_record.get("offset_seconds") or 0),
timezone_name=timezone_name,
)
if beijing_time is None:
continue
frame_record["beijing_time"] = beijing_time
changed_video_ids.add(video_id)
return changed_video_ids
def _run_inference(
    clip_records: list[dict[str, object]],
    video_records: list[dict[str, object]],
    output_dir: Path,
    config: dict[str, object],
    *,
    limit_clips: int | None,
    resume: bool,
) -> None:
    """Infer every clip that lacks an ``"ok"`` result and persist the results.

    Results live in ``clip_results.jsonl`` under *output_dir*. When *resume*
    is true the existing results are loaded and their timelines refreshed
    from the current clip records; clips whose loaded result has status
    ``"ok"`` are skipped. The results file is rewritten after every
    processed clip and once more at the end.

    Args:
        clip_records: Clips to consider, processed in order.
        video_records: Video records, looked up by ``video_id`` for each clip.
        output_dir: Directory holding ``clip_results.jsonl``.
        config: Pipeline configuration passed to the helpers.
        limit_clips: Maximum number of clips to process in this call, or
            ``None`` for no limit.
        resume: Whether to start from the existing results file.
    """
    results_file = output_dir / "clip_results.jsonl"

    clips_index: dict[str, dict[str, object]] = {}
    for clip in clip_records:
        if clip.get("clip_id"):
            clips_index[str(clip.get("clip_id"))] = clip

    videos_index: dict[str, dict[str, object]] = {}
    for video in video_records:
        if video.get("video_id"):
            videos_index[str(video.get("video_id"))] = video

    previous = read_jsonl(results_file) if resume else []
    results = [
        _refresh_result_timeline(entry, clips_index, config) for entry in previous
    ]
    # Fixed up front: results produced during this call do not extend it.
    done_ids = {
        str(entry.get("clip_id"))
        for entry in results
        if entry.get("status") == "ok" and entry.get("clip_id")
    }

    completed = 0
    for clip in clip_records:
        current_id = str(clip.get("clip_id"))
        if current_id in done_ids:
            continue
        if limit_clips is not None and completed >= limit_clips:
            break

        # Discard any earlier (non-ok) result for this clip before retrying.
        results = [
            entry for entry in results if str(entry.get("clip_id")) != current_id
        ]
        owner = videos_index.get(str(clip.get("video_id")), {})
        results.append(_infer_and_parse_clip(clip, owner, output_dir, config))
        # Persist after each clip so progress survives an interruption.
        _write_jsonl_exact(results_file, results)
        completed += 1

    # Final write also covers the case where no clip was processed but
    # loaded results were refreshed.
    _write_jsonl_exact(results_file, results)
def _refresh_result_timeline(
result_record: dict[str, object],
clip_by_id: dict[str, dict[str, object]],
config: dict[str, object],
) -> dict[str, object]:
clip_record = clip_by_id.get(str(result_record.get("clip_id")))
if not clip_record:
return result_record
if not _clip_has_beijing_timing(clip_record):
return result_record
timeline = dict(result_record.get("monitoring_timeline") or {})
timeline.update(
{
"timezone": config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE),
"clip_start_seconds": clip_record.get("clip_start_seconds"),
"clip_end_seconds": clip_record.get("clip_end_seconds"),
"clip_start_timecode": clip_record.get("clip_start_timecode"),
"clip_end_timecode": clip_record.get("clip_end_timecode"),
"clip_start_beijing_time": clip_record.get("clip_start_beijing_time"),
"clip_end_beijing_time": clip_record.get("clip_end_beijing_time"),
"frame_times": clip_record.get("frame_times", []),
}
)
refreshed = dict(result_record)
refreshed["monitoring_timeline"] = timeline
return refreshed
def _clip_has_beijing_timing(clip_record: dict[str, object]) -> bool:
if clip_record.get("clip_start_beijing_time") or clip_record.get("clip_end_beijing_time"):
return True
for frame in clip_record.get("frame_times", []) or []:
if isinstance(frame, dict) and frame.get("beijing_time"):
return True
return False
def _infer_and_parse_clip(
    clip_record: dict[str, object],
    video_record: dict[str, object],
    output_dir: Path,
    config: dict[str, object],
) -> dict[str, object]:
    """Run inference for one clip and turn the response into a result record.

    The clip is inferred up to ``schema.parse_retry + 1`` times. The first
    result whose status is not ``"parse_failed"`` is returned; if every
    attempt fails to parse, the last ``"parse_failed"`` result is returned.
    An exception raised by ``infer_clip`` is not retried: it immediately
    yields a result with status ``"inference_failed"`` and the exception
    text as its error.

    Args:
        clip_record: The clip to infer.
        video_record: The clip's video record, passed to the result builder.
        output_dir: Passed to ``infer_clip``.
        config: Pipeline configuration; ``config["vlm"]`` and
            ``config["prompt"]`` are required, ``config["schema"]`` is
            optional.

    Returns:
        The result record built by ``build_clip_result``.
    """
    schema_config = config.get("schema", {})
    parse_retry = 0
    if isinstance(schema_config, dict):
        # ``or 0`` tolerates an explicit null in the config.
        parse_retry = int(schema_config.get("parse_retry") or 0)
    # A negative retry count must still allow the single mandatory attempt;
    # without the clamp the loop below would not run at all.
    attempts = max(parse_retry, 0) + 1

    result: dict[str, object] | None = None
    for attempt in range(attempts):
        try:
            inference = infer_clip(
                clip_record,
                output_dir,
                config["vlm"],
                config["prompt"],
            )
        except Exception as exc:
            # Deliberately broad: any inference failure is recorded on the
            # clip's result instead of aborting the whole batch.
            return build_clip_result(
                "",
                clip_record,
                video_record,
                config,
                processing={},
                status="inference_failed",
                error=str(exc),
            )
        result = build_clip_result(
            str(inference.get("raw_response", "")),
            clip_record,
            video_record,
            config,
            processing={
                "latency_ms": inference.get("latency_ms"),
                "http_status": inference.get("http_status"),
                "attempt": attempt + 1,
            },
        )
        if result.get("status") != "parse_failed":
            return result

    if result is None:
        # attempts >= 1, so the loop always assigns result; kept as a guard.
        raise RuntimeError("unreachable inference state")
    return result
def _write_jsonl_exact(
path: Path,
records: list[dict[str, object]],
) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w", encoding="utf-8") as handle:
for record in records:
handle.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
if __name__ == "__main__":
    # Hand main()'s integer status to the interpreter as the exit code.
    exit_status = main()
    raise SystemExit(exit_status)