Initial video AI analysis project

2026-06-17 11:33:54 +08:00
commit ef0047af6d
35 changed files with 8613 additions and 0 deletions
--- a/video_ai_analysis_poc/init.py
+++ b/video_ai_analysis_poc/init.py
@@ -0,0 +1,9 @@
+"""Local video batch analysis PoC."""
+
+# Submodule names exposed by a star-import of this package; nothing is
+# imported eagerly in this file.
+__all__ = [
+    "config",
+    "discovery",
+    "manifest",
+    "paths",
+    "probe",
+]
--- a/video_ai_analysis_poc/aggregator.py
+++ b/video_ai_analysis_poc/aggregator.py
@@ -0,0 +1,403 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from .manifest import read_jsonl
+
+
+def aggregate_outputs(
+    output_dir: str | Path,
+    config: dict[str, Any],
+) -> dict[str, Any]:
+    """Aggregate clip-level results into per-video results and a folder summary.
+
+    Reads ``video_manifest.jsonl``, ``clip_manifest.jsonl`` and
+    ``clip_results.jsonl`` from ``output_dir``, writes one
+    ``videos/<video_id>/video_result.json`` per video plus
+    ``folder_summary.json``, and returns the folder summary.
+
+    Args:
+        output_dir: Directory that holds the manifests and receives the outputs.
+        config: Run configuration. ``schema.version`` (default
+            ``"local-batch-v1"``), ``schema.merge_gap_seconds`` (default 30)
+            and ``input.dir`` are read from it.
+
+    Returns:
+        The folder summary dictionary that was written to disk.
+    """
+    root = Path(output_dir).expanduser().resolve(strict=False)
+    started_at = _now_iso()
+    video_records = read_jsonl(root / "video_manifest.jsonl")
+    clip_records = read_jsonl(root / "clip_manifest.jsonl")
+    clip_results = read_jsonl(root / "clip_results.jsonl")
+
+    schema_version = str(config.get("schema", {}).get("version", "local-batch-v1"))
+    merge_gap_seconds = float(config.get("schema", {}).get("merge_gap_seconds", 30))
+    clips_by_video = _group_by_video(clip_records)
+    results_by_video = _group_by_video(clip_results)
+
+    videos_summary = []
+    folder_event_counts: dict[str, int] = {}
+    processed_video_count = 0
+    failed_video_count = 0
+
+    for video_record in video_records:
+        video_id = str(video_record.get("video_id") or "")
+        # Records without an id get no result file and no summary entry, but
+        # they are still included in "video_count" below.
+        if not video_id:
+            continue
+        video_clips = clips_by_video.get(video_id, [])
+        video_results = results_by_video.get(video_id, [])
+        video_result = _build_video_result(
+            video_record,
+            video_clips,
+            video_results,
+            schema_version=schema_version,
+            merge_gap_seconds=merge_gap_seconds,
+            started_at=started_at,
+        )
+        result_path = root / "videos" / video_id / "video_result.json"
+        _write_json(result_path, video_result)
+
+        failed_clip_count = int(video_result["failed_clip_count"])
+        # A video is failed when it was not probed successfully or when any of
+        # its clip results has a failure status.
+        video_failed = video_record.get("status") != "probed" or failed_clip_count > 0
+        if video_failed:
+            failed_video_count += 1
+        else:
+            processed_video_count += 1
+        # Fold this video's merged-event counts into the folder-wide totals.
+        for event_type, count in video_result["event_counts"].items():
+            folder_event_counts[event_type] = folder_event_counts.get(event_type, 0) + int(count)
+        videos_summary.append(
+            {
+                "video_id": video_id,
+                "video_path": video_result["video_path"],
+                "status": "failed" if video_failed else "processed",
+                "clip_count": video_result["clip_count"],
+                "failed_clip_count": failed_clip_count,
+                "failed_clip_counts": video_result["failed_clip_counts"],
+                "event_counts": video_result["event_counts"],
+                "outputs": {"video_result_json": f"videos/{video_id}/video_result.json"},
+                "error": video_record.get("last_error"),
+            }
+        )
+
+    folder_summary = {
+        "schema_version": schema_version,
+        "input_dir": str(config.get("input", {}).get("dir")),
+        "video_count": len(video_records),
+        "processed_video_count": processed_video_count,
+        "failed_video_count": failed_video_count,
+        "event_counts": dict(sorted(folder_event_counts.items())),
+        "videos": videos_summary,
+        "processing": {
+            "started_at": started_at,
+            "finished_at": _now_iso(),
+        },
+    }
+    _write_json(root / "folder_summary.json", folder_summary)
+    return folder_summary
+
+
+def _build_video_result(
+    video_record: dict[str, Any],
+    clip_records: list[dict[str, Any]],
+    clip_results: list[dict[str, Any]],
+    *,
+    schema_version: str,
+    merge_gap_seconds: float,
+    started_at: str,
+) -> dict[str, Any]:
+    """Assemble the ``video_result.json`` payload for a single video.
+
+    Events from successful clip results are normalized and merged, failure
+    statuses are tallied, and the probe/timeline metadata is attached.
+    """
+    failures = _failed_clip_counts(clip_results)
+    events = _merge_events(_event_records(clip_results), merge_gap_seconds)
+    duration_keys = ("duration_seconds", "video_duration_seconds", "duration")
+
+    payload: dict[str, Any] = {}
+    payload["schema_version"] = schema_version
+    payload["video_id"] = str(video_record.get("video_id"))
+    payload["video_path"] = _video_path(video_record, clip_results)
+    payload["probe"] = _probe(video_record)
+    payload["monitoring_timeline"] = {
+        "video_start_time": _video_start_time(video_record, clip_results),
+        "video_duration_seconds": _first_present(video_record, duration_keys),
+    }
+    payload["clip_count"] = len(clip_records)
+    payload["failed_clip_count"] = sum(failures.values())
+    payload["failed_clip_counts"] = failures
+    payload["event_counts"] = _event_counts(events)
+    payload["events"] = events
+    payload["outputs"] = {"clip_results_jsonl": "clip_results.jsonl"}
+    # The finish timestamp is taken last, once everything else is computed.
+    payload["processing"] = {
+        "started_at": started_at,
+        "finished_at": _now_iso(),
+    }
+    return payload
+
+
+def _event_records(clip_results: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return normalized events from all ``ok`` clip results, sorted for merging.
+
+    The sort order is video id, event type, start offset, end offset; missing
+    offsets sort as 0.
+    """
+
+    def _ordering(event: dict[str, Any]) -> tuple[str, str, float, float]:
+        return (
+            str(event.get("video_id")),
+            str(event.get("event_type")),
+            float(event.get("start_offset_seconds") or 0),
+            float(event.get("end_offset_seconds") or 0),
+        )
+
+    collected: list[dict[str, Any]] = []
+    for result in clip_results:
+        if result.get("status") == "ok":
+            timeline = result.get("monitoring_timeline")
+            # Anything that is not a non-empty dict is treated as "no timeline".
+            if not timeline or not isinstance(timeline, dict):
+                timeline = {}
+            collected.extend(
+                _normalize_event(raw_event, result, timeline)
+                for raw_event in (result.get("events") or [])
+                if isinstance(raw_event, dict)
+            )
+    collected.sort(key=_ordering)
+    return collected
+
+
+def _normalize_event(
+    event: dict[str, Any],
+    result: dict[str, Any],
+    timeline: dict[str, Any],
+) -> dict[str, Any]:
+    """Convert one raw clip event into the record shape used for merging.
+
+    Args:
+        event: Event dictionary taken from a clip result's ``events`` list.
+        result: The clip result the event belongs to (supplies ids).
+        timeline: The clip result's ``monitoring_timeline`` (may be empty).
+
+    Returns:
+        A new dictionary with float offsets, evidence lists and a
+        ``source_event_count`` of 1.
+    """
+    clip_id = str(result.get("clip_id"))
+    # A null or otherwise non-list ``frame_times`` is treated as "no frames"
+    # instead of raising while iterating.
+    raw_frames = timeline.get("frame_times")
+    if not isinstance(raw_frames, (list, tuple)):
+        raw_frames = []
+    frame_times = [dict(frame) for frame in raw_frames if isinstance(frame, dict)]
+    frame_paths = [
+        str(frame.get("frame_path"))
+        for frame in frame_times
+        if frame.get("frame_path") is not None
+    ]
+    # ``dict.get(key, default)`` only uses the default when the key is absent,
+    # so an explicit null (or unparsable) offset is handled here by falling
+    # back to the clip bounds as well.
+    start = _float_or_none(event.get("start_offset_seconds"))
+    if start is None:
+        start = _float_or_none(timeline.get("clip_start_seconds"))
+    end = _float_or_none(event.get("end_offset_seconds"))
+    if end is None:
+        end = _float_or_none(timeline.get("clip_end_seconds"))
+    screen_time = str(timeline.get("screen_time") or "")
+    normalized = {
+        "video_id": str(result.get("video_id")),
+        "event_type": str(event.get("event_type") or "unknown"),
+        "start_time": event.get("start_time"),
+        "end_time": event.get("end_time"),
+        "start_offset_seconds": start,
+        "end_offset_seconds": end,
+        "confidence": event.get("confidence"),
+        "severity": event.get("severity"),
+        "attributes": event.get("attributes") if isinstance(event.get("attributes"), dict) else {},
+        "screen_times": [screen_time] if screen_time else [],
+        "evidence": {
+            "clip_ids": [clip_id],
+            "frame_paths": frame_paths,
+            "frame_times": frame_times,
+            "clips": [
+                {
+                    "clip_id": clip_id,
+                    "clip_start_seconds": timeline.get("clip_start_seconds"),
+                    "clip_end_seconds": timeline.get("clip_end_seconds"),
+                    "clip_start_timecode": timeline.get("clip_start_timecode"),
+                    "clip_end_timecode": timeline.get("clip_end_timecode"),
+                    "clip_start_beijing_time": timeline.get("clip_start_beijing_time"),
+                    "clip_end_beijing_time": timeline.get("clip_end_beijing_time"),
+                    "screen_time": screen_time,
+                }
+            ],
+        },
+        "source_event_count": 1,
+    }
+    # Evidence supplied by the event itself is appended to the clip-derived
+    # evidence, with duplicates removed.
+    original_evidence = event.get("evidence")
+    if isinstance(original_evidence, dict):
+        original_clip_id = original_evidence.get("clip_id")
+        if original_clip_id:
+            normalized["evidence"]["clip_ids"] = _unique(
+                [*normalized["evidence"]["clip_ids"], str(original_clip_id)]
+            )
+        original_frame_paths = original_evidence.get("frame_paths")
+        if isinstance(original_frame_paths, list):
+            normalized["evidence"]["frame_paths"] = _unique(
+                [*normalized["evidence"]["frame_paths"], *map(str, original_frame_paths)]
+            )
+    return normalized
+
+
+def _merge_events(
+    events: list[dict[str, Any]],
+    merge_gap_seconds: float,
+) -> list[dict[str, Any]]:
+    """Fold consecutive mergeable events together and drop their ``video_id``.
+
+    ``events`` is expected in the order produced by ``_event_records``; each
+    event is compared only with the most recently emitted merged event.
+    """
+    combined: list[dict[str, Any]] = []
+    for candidate in events:
+        if combined and _can_merge(combined[-1], candidate, merge_gap_seconds):
+            _merge_into(combined[-1], candidate)
+        else:
+            # Start a new group from an independent copy of the event.
+            combined.append(_copy_event(candidate))
+    for merged_event in combined:
+        merged_event.pop("video_id", None)
+    return combined
+
+
+def _can_merge(
+    previous: dict[str, Any],
+    current: dict[str, Any],
+    merge_gap_seconds: float,
+) -> bool:
+    """Tell whether ``current`` may be merged into ``previous``.
+
+    Both must share video id and event type, both offsets must be numeric, and
+    the gap from ``previous``' end to ``current``'s start must not exceed
+    ``merge_gap_seconds``.
+    """
+    same_video = previous.get("video_id") == current.get("video_id")
+    if not (same_video and previous.get("event_type") == current.get("event_type")):
+        return False
+    end_of_previous = _float_or_none(previous.get("end_offset_seconds"))
+    start_of_current = _float_or_none(current.get("start_offset_seconds"))
+    if end_of_previous is not None and start_of_current is not None:
+        return start_of_current - end_of_previous <= merge_gap_seconds
+    return False
+
+
+def _merge_into(target: dict[str, Any], event: dict[str, Any]) -> None:
+    """Merge ``event`` into ``target`` in place.
+
+    Offsets are widened to cover both events, evidence lists are unioned
+    without duplicates, ``source_event_count`` is summed and the higher
+    numeric confidence wins. ``event`` itself is not modified.
+    """
+    previous_end = _float_or_none(target.get("end_offset_seconds"))
+    incoming_end = _float_or_none(event.get("end_offset_seconds"))
+    extends_end = incoming_end is not None and (
+        previous_end is None or incoming_end > previous_end
+    )
+
+    target["start_offset_seconds"] = _min_number(
+        target.get("start_offset_seconds"),
+        event.get("start_offset_seconds"),
+    )
+    target["end_offset_seconds"] = _max_number(
+        target.get("end_offset_seconds"),
+        event.get("end_offset_seconds"),
+    )
+    # Keep the end label in step with the end offset: when the incoming event
+    # pushes the end further out, its ``end_time`` describes the merged end.
+    if extends_end and event.get("end_time") is not None:
+        target["end_time"] = event.get("end_time")
+    target["screen_times"] = _unique(
+        [*target.get("screen_times", []), *event.get("screen_times", [])]
+    )
+    target["source_event_count"] = int(target.get("source_event_count", 1)) + int(
+        event.get("source_event_count", 1)
+    )
+
+    target_evidence = target["evidence"]
+    event_evidence = event["evidence"]
+    # Every evidence list is de-duplicated: two events detected in the same
+    # clip carry identical frame and clip entries.
+    for key in ("clip_ids", "frame_paths"):
+        target_evidence[key] = _unique(
+            [*target_evidence.get(key, []), *event_evidence.get(key, [])]
+        )
+    for key in ("frame_times", "clips"):
+        # Dict entries are copied so the merged event never aliases the source.
+        incoming = [
+            dict(item) if isinstance(item, dict) else item
+            for item in event_evidence.get(key, [])
+        ]
+        target_evidence[key] = _unique([*target_evidence.get(key, []), *incoming])
+
+    current_confidence = target.get("confidence")
+    current_number = _float_or_none(current_confidence)
+    incoming_number = _float_or_none(event.get("confidence"))
+    if current_confidence is None:
+        target["confidence"] = event.get("confidence")
+    elif current_number is None:
+        # The stored value is not numeric (e.g. a label); prefer a numeric one.
+        if incoming_number is not None:
+            target["confidence"] = incoming_number
+    elif incoming_number is not None:
+        target["confidence"] = max(current_number, incoming_number)
+
+
+def _copy_event(event: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of ``event`` whose lists and evidence dicts are independent."""
+    source_evidence = event["evidence"]
+    evidence_copy = {
+        "clip_ids": [*source_evidence.get("clip_ids", [])],
+        "frame_paths": [*source_evidence.get("frame_paths", [])],
+        "frame_times": [dict(entry) for entry in source_evidence.get("frame_times", [])],
+        "clips": [dict(entry) for entry in source_evidence.get("clips", [])],
+    }
+    duplicate = dict(event)
+    duplicate.update(
+        screen_times=[*event.get("screen_times", [])],
+        attributes=dict(event.get("attributes", {})),
+        evidence=evidence_copy,
+    )
+    return duplicate
+
+
+def _group_by_video(records: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
+    """Bucket records by stringified ``video_id``; records without one are dropped."""
+    buckets: dict[str, list[dict[str, Any]]] = {}
+    for entry in records:
+        raw_id = entry.get("video_id")
+        if not raw_id:
+            continue
+        key = str(raw_id)
+        if key not in buckets:
+            buckets[key] = []
+        buckets[key].append(entry)
+    return buckets
+
+
+def _failed_clip_counts(clip_results: list[dict[str, Any]]) -> dict[str, int]:
+    """Count clip results per failure status (``parse_failed``/``inference_failed``)."""
+    tally = dict.fromkeys(("parse_failed", "inference_failed"), 0)
+    for status in (result.get("status") for result in clip_results):
+        # Any other status (including "ok") is ignored.
+        if status in tally:
+            tally[str(status)] = tally[str(status)] + 1
+    return tally
+
+
+def _event_counts(events: list[dict[str, Any]]) -> dict[str, int]:
+    """Count events per type, keyed in sorted order; missing types count as ``unknown``."""
+    labels = sorted(str(event.get("event_type") or "unknown") for event in events)
+    tally: dict[str, int] = {}
+    # Labels arrive sorted, so insertion order is already the sorted key order.
+    for label in labels:
+        tally[label] = tally.get(label, 0) + 1
+    return tally
+
+
+def _probe(video_record: dict[str, Any]) -> dict[str, Any]:
+    """Extract the probe section from a video manifest record.
+
+    Identity and bookkeeping fields are removed; ``status`` is always
+    re-added and ``last_error`` only when it is set.
+    """
+    probe = dict(video_record)
+    for dropped in ("video_id", "path", "source_path", "status", "retry_count", "last_error"):
+        probe.pop(dropped, None)
+    probe["status"] = video_record.get("status")
+    error = video_record.get("last_error")
+    if error is not None:
+        probe["last_error"] = error
+    return probe
+
+
+def _video_path(
+    video_record: dict[str, Any],
+    clip_results: list[dict[str, Any]],
+) -> str | None:
+    """Resolve the video path from the manifest record, else from clip results."""
+    candidate = video_record.get("path")
+    if not candidate:
+        candidate = video_record.get("source_path")
+    if candidate is None:
+        # Fall back to the first clip result that recorded a path.
+        return next(
+            (
+                str(result["video_path"])
+                for result in clip_results
+                if result.get("video_path") is not None
+            ),
+            None,
+        )
+    return str(candidate)
+
+
+def _video_start_time(
+    video_record: dict[str, Any],
+    clip_results: list[dict[str, Any]],
+) -> Any:
+    """Return the video start time from the record, else the first clip timeline that has one."""
+    recorded = video_record.get("video_start_time")
+    if recorded is not None:
+        return recorded
+    timelines = (result.get("monitoring_timeline") for result in clip_results)
+    return next(
+        (
+            timeline.get("video_start_time")
+            for timeline in timelines
+            if isinstance(timeline, dict) and timeline.get("video_start_time") is not None
+        ),
+        None,
+    )
+
+
+def _first_present(record: dict[str, Any], keys: tuple[str, ...]) -> Any:
+    """Return the value of the first key in ``keys`` that is not ``None``, else ``None``."""
+    return next(
+        (record.get(name) for name in keys if record.get(name) is not None),
+        None,
+    )
+
+
+def _float_or_none(value: Any) -> float | None:
+    """Convert ``value`` to ``float``; return ``None`` for ``None`` or unconvertible input."""
+    try:
+        return None if value is None else float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _min_number(left: Any, right: Any) -> float | None:
+    """Return the smaller of the two values that convert to float, or ``None`` if neither does."""
+    numbers = [number for number in map(_float_or_none, (left, right)) if number is not None]
+    if not numbers:
+        return None
+    return min(numbers)
+
+
+def _max_number(left: Any, right: Any) -> float | None:
+    """Return the larger of the two values that convert to float, or ``None`` if neither does."""
+    numbers = [number for number in map(_float_or_none, (left, right)) if number is not None]
+    if not numbers:
+        return None
+    return max(numbers)
+
+
+def _unique(values: list[Any]) -> list[Any]:
+    """Return ``values`` without duplicates, keeping first occurrences in order.
+
+    Hashable values are compared directly. Unhashable values (dicts, lists)
+    are compared through their key-sorted JSON text, so key order does not
+    matter for dicts.
+    """
+    seen = set()
+    unique_values = []
+    for value in values:
+        try:
+            hash(value)
+        except TypeError:
+            # The tag keeps a container apart from a plain string that happens
+            # to equal its JSON text; ``default=repr`` keeps values that JSON
+            # cannot encode from aborting the whole de-duplication.
+            marker = ("json", json.dumps(value, sort_keys=True, default=repr))
+        else:
+            marker = ("value", value)
+        if marker in seen:
+            continue
+        seen.add(marker)
+        unique_values.append(value)
+    return unique_values
+
+
+def _write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Write ``payload`` as indented, key-sorted UTF-8 JSON, creating parent directories."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    rendered = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True)
+    path.write_text(rendered + "\n", encoding="utf-8")
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as an ISO 8601 string."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
--- a/video_ai_analysis_poc/cli.py
+++ b/video_ai_analysis_poc/cli.py
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Sequence
+
+from .aggregator import aggregate_outputs
+from .clips import build_clip_records
+from .config import DEFAULT_CONFIG_PATH, load_config
+from .discovery import discover_videos
+from .ffmpeg_sampler import sample_video_frames
+from .hik_cloud import download_hik_cloud_recordings
+from .manifest import read_jsonl, write_manifest
+from .paths import stable_video_id
+from .probe import probe_video
+from .result_parser import build_clip_result
+from .timeline import DEFAULT_TIMEZONE, format_beijing_time, timeline_start_epoch
+from .vlm_client import infer_clip
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Run the batch pipeline from the command line.
+
+    Stages, in order: load config, acquire and probe source videos, sample
+    frames, build clips, run clip inference, aggregate outputs. Each stage
+    writes its manifest under the configured output directory.
+
+    Args:
+        argv: Argument list to parse; ``None`` lets argparse use its default.
+
+    Returns:
+        ``0`` when the requested stages complete. Invalid option combinations
+        and unsupported source modes are reported through ``parser.error``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Local video batch analysis PoC entrypoint."
+    )
+    parser.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
+    parser.add_argument("--input-dir")
+    parser.add_argument("--output-dir")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--until", choices=["clips", "inference"])
+    parser.add_argument("--limit-clips", type=int)
+    args = parser.parse_args(argv)
+
+    config = load_config(
+        args.config,
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+    )
+    if args.dry_run and args.until:
+        parser.error("--dry-run cannot be combined with --until")
+    if args.limit_clips is not None and args.limit_clips < 0:
+        parser.error("--limit-clips must be non-negative")
+
+    output_dir = Path(config["output"]["dir"])
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Stage 1: video manifest. With resume enabled the previous manifest is
+    # the starting point and is indexed by video id.
+    video_manifest_path = output_dir / "video_manifest.jsonl"
+    resume_enabled = bool(config.get("output", {}).get("resume", False))
+    records = _load_resume_records(
+        video_manifest_path,
+        resume=resume_enabled,
+    )
+    record_indexes = {
+        _record_key(record): index
+        for index, record in enumerate(records)
+        if _record_key(record) is not None
+    }
+
+    try:
+        _acquire_source_records(
+            config,
+            output_dir,
+            records,
+            record_indexes,
+            download_source=not args.dry_run,
+        )
+    except ValueError as exc:
+        parser.error(str(exc))
+
+    write_manifest(video_manifest_path, records)
+    # A dry run stops once the video manifest has been written.
+    if args.dry_run:
+        return 0
+
+    clip_manifest_path = output_dir / "clip_manifest.jsonl"
+    existing_clip_records = read_jsonl(clip_manifest_path) if resume_enabled else []
+    existing_clip_video_ids = {
+        str(record.get("video_id"))
+        for record in existing_clip_records
+        if record.get("video_id")
+    }
+
+    # Stage 2: frame sampling. Resumed frame records lacking "beijing_time"
+    # are backfilled first; the affected videos get their clips rebuilt below.
+    frame_manifest_path = output_dir / "frame_manifest.jsonl"
+    frame_records = read_jsonl(frame_manifest_path) if resume_enabled else []
+    timezone_name = str(config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE))
+    backfilled_frame_video_ids = _backfill_frame_beijing_times(
+        frame_records,
+        records,
+        timezone_name=timezone_name,
+    )
+    existing_sampled_video_ids = {
+        str(record.get("video_id"))
+        for record in frame_records
+        if record.get("status") == "sampled" and record.get("video_id")
+    }
+    changed_frame_video_ids: set[str] = set(backfilled_frame_video_ids)
+    for record in records:
+        # Only successfully probed videos are sampled.
+        if record.get("status") != "probed":
+            continue
+        video_id = str(record.get("video_id"))
+        # With --until inference, videos that already have clips are not re-sampled.
+        if args.until == "inference" and video_id in existing_clip_video_ids:
+            continue
+        # Videos that already have sampled frames are left untouched.
+        if video_id in existing_sampled_video_ids:
+            continue
+        # Drop any earlier (non-sampled) frame records before re-sampling.
+        frame_records = _without_video_records(frame_records, video_id)
+        ffmpeg_config = dict(config["ffmpeg"])
+        ffmpeg_config["timezone"] = timezone_name
+        frame_records.extend(
+            sample_video_frames(
+                record,
+                output_dir,
+                ffmpeg_config,
+                manifest_path=None,
+            )
+        )
+        changed_frame_video_ids.add(video_id)
+    write_manifest(frame_manifest_path, frame_records)
+
+    # Stage 3: clips. Rebuild clips for videos whose frames changed in this
+    # run and for sampled videos that have no clips yet; keep the rest.
+    sampled_video_ids = {
+        str(record.get("video_id"))
+        for record in frame_records
+        if record.get("status") == "sampled" and record.get("video_id")
+    }
+    clip_rebuild_video_ids = changed_frame_video_ids | (
+        sampled_video_ids - existing_clip_video_ids
+    )
+    clip_records = [
+        record
+        for record in existing_clip_records
+        if str(record.get("video_id")) not in clip_rebuild_video_ids
+    ]
+    frames_to_build = [
+        record
+        for record in frame_records
+        if str(record.get("video_id")) in clip_rebuild_video_ids
+    ]
+    clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
+    write_manifest(output_dir / "clip_manifest.jsonl", clip_records)
+    if args.until == "clips":
+        return 0
+
+    # Stage 4: inference over the clips, then (unless stopped) aggregation.
+    _run_inference(
+        clip_records,
+        records,
+        output_dir,
+        config,
+        limit_clips=args.limit_clips,
+        resume=resume_enabled,
+    )
+    if args.until == "inference":
+        return 0
+    aggregate_outputs(output_dir, config)
+    return 0
+
+
+def _load_resume_records(path: Path, *, resume: bool) -> list[dict[str, object]]:
+    """Return the records stored at ``path`` when resuming, otherwise an empty list."""
+    return read_jsonl(path) if resume else []
+
+
+def _record_key(record: dict[str, object]) -> str | None:
+    """Return the lookup key of a manifest record.
+
+    The stored ``video_id`` wins; otherwise the id is derived from ``path``.
+    ``None`` is returned when neither is available.
+    """
+    stored_id = record.get("video_id")
+    if stored_id:
+        return str(stored_id)
+    source_path = record.get("path")
+    return stable_video_id(str(source_path)) if source_path else None
+
+
+def _acquire_source_records(
+    config: dict[str, object],
+    output_dir: Path,
+    records: list[dict[str, object]],
+    record_indexes: dict[str, int],
+    *,
+    download_source: bool = True,
+) -> None:
+    """Probe source videos and merge the results into ``records`` in place.
+
+    Videos whose existing record already has status ``probed`` are skipped.
+    Other videos are probed again and replace their old record or are
+    appended; ``record_indexes`` is kept in sync for appended records.
+    """
+    sources = _source_video_records(
+        config,
+        output_dir,
+        download_source=download_source,
+    )
+    for source_record in sources:
+        source_path = source_record.get("path")
+        if not source_path:
+            continue
+        video_id = stable_video_id(str(source_path))
+        slot = record_indexes.get(video_id)
+        already_probed = slot is not None and records[slot].get("status") == "probed"
+        if already_probed:
+            continue
+
+        probed = probe_video(
+            str(source_path),
+            timeout_seconds=config["ffprobe"]["timeout_seconds"],
+        )
+        # Probe fields override source fields; the stable id always wins.
+        merged = {**source_record, **probed, "video_id": video_id}
+        if slot is not None:
+            records[slot] = merged
+        else:
+            record_indexes[video_id] = len(records)
+            records.append(merged)
+
+
+def _source_video_records(
+    config: dict[str, object],
+    output_dir: Path,
+    *,
+    download_source: bool = True,
+) -> list[dict[str, object]]:
+    """List source video records for the configured ``source.mode``.
+
+    ``local`` (the default) discovers files under the input directory;
+    ``hik_cloud`` returns the recordings reported as ``downloaded``.
+
+    Raises:
+        ValueError: If ``source.mode`` is neither ``local`` nor ``hik_cloud``.
+    """
+    source_config = config.get("source", {})
+    if isinstance(source_config, dict):
+        mode = str(source_config.get("mode", "local"))
+    else:
+        mode = "local"
+
+    if mode == "hik_cloud":
+        downloaded = []
+        for recording in download_hik_cloud_recordings(
+            config,
+            output_dir,
+            download=download_source,
+        ):
+            if recording.get("status") == "downloaded":
+                downloaded.append(recording)
+        return downloaded
+
+    if mode != "local":
+        raise ValueError(f"unsupported source.mode: {mode}")
+
+    input_config = config["input"]
+    found = discover_videos(
+        input_config["dir"],
+        input_config["extensions"],
+        recursive=input_config["recursive"],
+    )
+    return [{"path": video_path} for video_path in found]
+
+
+def _without_video_records(
+    records: list[dict[str, object]],
+    video_id: str,
+) -> list[dict[str, object]]:
+    """Return a new list without the records that belong to ``video_id``."""
+    kept: list[dict[str, object]] = []
+    for entry in records:
+        if str(entry.get("video_id")) == video_id:
+            continue
+        kept.append(entry)
+    return kept
+
+
+def _backfill_frame_beijing_times(
+    frame_records: list[dict[str, object]],
+    video_records: list[dict[str, object]],
+    *,
+    timezone_name: str,
+) -> set[str]:
+    """Fill in a missing ``beijing_time`` on sampled frame records, in place.
+
+    Returns:
+        The ids of the videos for which at least one frame record was updated.
+    """
+    video_by_id: dict[str, dict[str, object]] = {}
+    for video in video_records:
+        if video.get("video_id"):
+            video_by_id[str(video.get("video_id"))] = video
+
+    touched: set[str] = set()
+    for frame in frame_records:
+        needs_fill = frame.get("status") == "sampled" and not frame.get("beijing_time")
+        if not needs_fill:
+            continue
+        video_id = str(frame.get("video_id") or "")
+        # Frames of unknown videos are resolved against an empty record.
+        start_epoch = timeline_start_epoch(video_by_id.get(video_id, {}))
+        formatted = format_beijing_time(
+            start_epoch,
+            offset_seconds=float(frame.get("offset_seconds") or 0),
+            timezone_name=timezone_name,
+        )
+        if formatted is not None:
+            frame["beijing_time"] = formatted
+            touched.add(video_id)
+    return touched
+
+
+def _run_inference(
+    clip_records: list[dict[str, object]],
+    video_records: list[dict[str, object]],
+    output_dir: Path,
+    config: dict[str, object],
+    *,
+    limit_clips: int | None,
+    resume: bool,
+) -> None:
+    """Run inference for clips without an ``ok`` result and persist the results.
+
+    Results are stored in ``clip_results.jsonl`` under ``output_dir``. The
+    file is rewritten after every processed clip and once more at the end.
+
+    Args:
+        clip_records: Clips to consider, in processing order.
+        video_records: Video manifest records, looked up by ``video_id``.
+        output_dir: Directory that holds ``clip_results.jsonl``.
+        config: Run configuration passed on to the inference helpers.
+        limit_clips: Maximum number of clips to infer in this call, or
+            ``None`` for no limit.
+        resume: When true, previously stored results are loaded and kept.
+    """
+    results_path = output_dir / "clip_results.jsonl"
+    result_records = read_jsonl(results_path) if resume else []
+    clip_by_id = {
+        str(record.get("clip_id")): record
+        for record in clip_records
+        if record.get("clip_id")
+    }
+    # Bring the timeline of stored results in line with the current clip records.
+    result_records = [
+        _refresh_result_timeline(record, clip_by_id, config)
+        for record in result_records
+    ]
+    ok_clip_ids = {
+        str(record.get("clip_id"))
+        for record in result_records
+        if record.get("status") == "ok" and record.get("clip_id")
+    }
+    video_by_id = {
+        str(record.get("video_id")): record
+        for record in video_records
+        if record.get("video_id")
+    }
+    processed = 0
+    for clip_record in clip_records:
+        clip_id = str(clip_record.get("clip_id"))
+        # Clips that already succeeded are skipped and do not count towards the limit.
+        if clip_id in ok_clip_ids:
+            continue
+        if limit_clips is not None and processed >= limit_clips:
+            break
+
+        # Replace any earlier (failed) result of this clip with the new one.
+        result_records = [
+            record for record in result_records if str(record.get("clip_id")) != clip_id
+        ]
+        video_record = video_by_id.get(str(clip_record.get("video_id")), {})
+        result = _infer_and_parse_clip(clip_record, video_record, output_dir, config)
+        result_records.append(result)
+        # Persist after every clip; presumably so progress survives an
+        # interrupted run - confirm before batching these writes.
+        _write_jsonl_exact(results_path, result_records)
+        processed += 1
+
+    # Final write also covers the case where no clip was processed.
+    _write_jsonl_exact(results_path, result_records)
+
+
+def _refresh_result_timeline(
+    result_record: dict[str, object],
+    clip_by_id: dict[str, dict[str, object]],
+    config: dict[str, object],
+) -> dict[str, object]:
+    """Return the result with its timeline refreshed from the matching clip record.
+
+    The input is returned unchanged when the clip is unknown or carries no
+    Beijing-time information; otherwise a shallow copy with an updated
+    ``monitoring_timeline`` is returned.
+    """
+    clip_record = clip_by_id.get(str(result_record.get("clip_id")))
+    if not clip_record or not _clip_has_beijing_timing(clip_record):
+        return result_record
+
+    timeline = dict(result_record.get("monitoring_timeline") or {})
+    timeline["timezone"] = config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE)
+    copied_fields = (
+        "clip_start_seconds",
+        "clip_end_seconds",
+        "clip_start_timecode",
+        "clip_end_timecode",
+        "clip_start_beijing_time",
+        "clip_end_beijing_time",
+    )
+    for field in copied_fields:
+        timeline[field] = clip_record.get(field)
+    timeline["frame_times"] = clip_record.get("frame_times", [])
+    return {**result_record, "monitoring_timeline": timeline}
+
+
+def _clip_has_beijing_timing(clip_record: dict[str, object]) -> bool:
+    """Tell whether the clip or any of its frames carries a Beijing-time value."""
+    clip_level = (
+        clip_record.get("clip_start_beijing_time")
+        or clip_record.get("clip_end_beijing_time")
+    )
+    if clip_level:
+        return True
+    frames = clip_record.get("frame_times", []) or []
+    return any(
+        isinstance(frame, dict) and bool(frame.get("beijing_time"))
+        for frame in frames
+    )
+
+
+def _infer_and_parse_clip(
+    clip_record: dict[str, object],
+    video_record: dict[str, object],
+    output_dir: Path,
+    config: dict[str, object],
+) -> dict[str, object]:
+    """Run inference for one clip and parse the response into a clip result.
+
+    The call is repeated while parsing fails, up to ``schema.parse_retry``
+    extra attempts. An exception raised by the inference call itself ends the
+    attempts immediately with an ``inference_failed`` result.
+
+    Returns:
+        The first result whose status is not ``parse_failed``, otherwise the
+        last ``parse_failed`` result.
+    """
+    schema_config = config.get("schema", {})
+    parse_retry = 0
+    if isinstance(schema_config, dict):
+        parse_retry = int(schema_config.get("parse_retry", 0))
+
+    # Always make at least one attempt: a negative ``parse_retry`` would
+    # otherwise skip the loop entirely and abort the whole batch.
+    attempts = max(parse_retry, 0) + 1
+    result: dict[str, object] | None = None
+    for attempt in range(attempts):
+        try:
+            inference = infer_clip(
+                clip_record,
+                output_dir,
+                config["vlm"],
+                config["prompt"],
+            )
+        except Exception as exc:
+            # Deliberately broad: any inference failure is recorded as a
+            # failed clip result so one clip cannot stop the batch.
+            return build_clip_result(
+                "",
+                clip_record,
+                video_record,
+                config,
+                processing={},
+                status="inference_failed",
+                error=str(exc),
+            )
+
+        result = build_clip_result(
+            str(inference.get("raw_response", "")),
+            clip_record,
+            video_record,
+            config,
+            processing={
+                "latency_ms": inference.get("latency_ms"),
+                "http_status": inference.get("http_status"),
+                "attempt": attempt + 1,
+            },
+        )
+        if result.get("status") != "parse_failed":
+            return result
+    if result is None:
+        # Cannot happen now that ``attempts`` is at least 1; kept as a guard.
+        raise RuntimeError("unreachable inference state")
+    return result
+
+
+def _write_jsonl_exact(
+    path: Path,
+    records: list[dict[str, object]],
+) -> None:
+    """Overwrite ``path`` with one key-sorted JSON object per line, creating parent directories."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        # The generator serializes lazily, so records are written one by one.
+        handle.writelines(
+            json.dumps(entry, ensure_ascii=False, sort_keys=True) + "\n"
+            for entry in records
+        )
+
+
+# Script entry point: the return value of main() becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/video_ai_analysis_poc/clips.py
+++ b/video_ai_analysis_poc/clips.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from .frames import seconds_to_timecode
+from .manifest import read_jsonl, write_manifest
+from .timeline import derive_time_from_reference
+
+
+def build_clip_records(
+    frame_records: list[dict[str, Any]],
+    clip_config: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Build clip records for every video that has sampled frames.
+
+    Only frame records whose ``status`` is ``"sampled"`` are used. Frames are
+    grouped by their stringified ``video_id`` and the groups are processed in
+    sorted id order, so the output order does not depend on input order.
+    """
+    grouped: dict[str, list[dict[str, Any]]] = {}
+    for record in frame_records:
+        if record.get("status") != "sampled":
+            continue
+        key = str(record["video_id"])
+        if key not in grouped:
+            grouped[key] = []
+        grouped[key].append(record)
+
+    clip_records: list[dict[str, Any]] = []
+    for key in sorted(grouped):
+        clip_records.extend(_build_video_clips(key, grouped[key], clip_config))
+    return clip_records
+
+
+def build_clip_records_from_manifest(
+    frame_manifest_path: str | Path,
+    clip_manifest_path: str | Path,
+    clip_config: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Read the frame manifest, build clip records, and write the clip manifest.
+
+    Returns the clip records that were passed to ``write_manifest``.
+    """
+    frame_records = read_jsonl(frame_manifest_path)
+    clip_records = build_clip_records(frame_records, clip_config)
+    write_manifest(clip_manifest_path, clip_records)
+    return clip_records
+
+
+def _build_video_clips(
+    video_id: str,
+    frames: list[dict[str, Any]],
+    clip_config: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Cut one video's frames into fixed-length clip records.
+
+    Windows start at offset 0 and advance by ``stride_seconds`` until the
+    window start passes the largest frame offset. Each window is half-open,
+    ``[start, end)``, with ``end`` capped at the estimated end of the timeline.
+    A window becomes a clip only when it holds at least
+    ``min_frames_per_clip`` frames; clip ids are numbered consecutively over
+    the clips that are actually emitted.
+
+    Args:
+        video_id: Identifier copied into every clip and used as clip-id prefix.
+        frames: Frame records of this video; each needs ``offset_seconds``.
+        clip_config: Optional keys ``length_seconds`` (default 10),
+            ``stride_seconds`` (default: the length), ``frames_per_clip``
+            (default 8) and ``min_frames_per_clip`` (default 4).
+
+    Returns:
+        Clip records in timeline order; empty when ``frames`` is empty.
+
+    Raises:
+        ValueError: If ``stride_seconds`` is not greater than zero.
+    """
+    from bisect import bisect_left
+
+    sorted_frames = sorted(frames, key=lambda frame: float(frame["offset_seconds"]))
+    if not sorted_frames:
+        return []
+
+    length_seconds = float(clip_config.get("length_seconds", 10))
+    stride_seconds = float(clip_config.get("stride_seconds", length_seconds))
+    if not stride_seconds > 0:
+        # A zero or negative stride never moves the window forward, so the
+        # loop below would never terminate.
+        raise ValueError("clip stride_seconds must be greater than 0")
+    frames_per_clip = int(clip_config.get("frames_per_clip", 8))
+    min_frames_per_clip = int(clip_config.get("min_frames_per_clip", 4))
+    # Offsets are converted once; the frames are already sorted by them, so
+    # each window can be located by binary search instead of a full scan.
+    offsets = [float(frame["offset_seconds"]) for frame in sorted_frames]
+    max_offset = offsets[-1]
+    timeline_end = _estimated_timeline_end(sorted_frames)
+
+    clips = []
+    clip_index = 1
+    start = 0.0
+    while start <= max_offset:
+        end = min(start + length_seconds, timeline_end)
+        # Frames with start <= offset < end.
+        in_window = sorted_frames[bisect_left(offsets, start) : bisect_left(offsets, end)]
+        if len(in_window) >= min_frames_per_clip:
+            selected_frames = _uniform_sample(in_window, frames_per_clip)
+            start_beijing_time, end_beijing_time = _clip_beijing_time_range(
+                in_window,
+                start,
+                end,
+            )
+            clip = {
+                "video_id": video_id,
+                "clip_id": f"{video_id}_c{clip_index:06d}",
+                "clip_start_seconds": round(start, 6),
+                "clip_end_seconds": round(end, 6),
+                "clip_start_timecode": seconds_to_timecode(start),
+                "clip_end_timecode": seconds_to_timecode(end),
+                "frame_times": [_frame_time(frame) for frame in selected_frames],
+                "status": "pending",
+                "retry_count": 0,
+                "last_error": None,
+            }
+            # Wall-clock bounds are only present when a frame carries one.
+            if start_beijing_time is not None:
+                clip["clip_start_beijing_time"] = start_beijing_time
+            if end_beijing_time is not None:
+                clip["clip_end_beijing_time"] = end_beijing_time
+            clips.append(clip)
+            clip_index += 1
+        start += stride_seconds
+    return clips
+
+
+def _estimated_timeline_end(frames: list[dict[str, Any]]) -> float:
+    """Estimate where the timeline ends from the frame offsets.
+
+    The result is the last offset plus the smallest strictly positive gap
+    between neighbouring offsets, or just the last offset when no such gap
+    exists (a single frame, or all offsets equal). ``frames`` must be
+    non-empty and is read in the order given.
+    """
+    offsets = [float(frame["offset_seconds"]) for frame in frames]
+    final_offset = offsets[-1]
+    positive_gaps = [
+        later - earlier
+        for earlier, later in zip(offsets[:-1], offsets[1:])
+        if later > earlier
+    ]
+    if positive_gaps:
+        return final_offset + min(positive_gaps)
+    return final_offset
+
+
+def _uniform_sample(
+    frames: list[dict[str, Any]],
+    frames_per_clip: int,
+) -> list[dict[str, Any]]:
+    """Pick at most ``frames_per_clip`` frames spread evenly over ``frames``.
+
+    The input list itself is returned when it is already short enough. When
+    fewer than two frames are requested only the first frame is kept.
+    Otherwise the first and last frames are always included and the picks in
+    between are taken at rounded, evenly spaced indexes.
+    """
+    total = len(frames)
+    if total <= frames_per_clip:
+        return frames
+    if frames_per_clip <= 1:
+        return [frames[0]]
+
+    final_index = total - 1
+    slots = frames_per_clip - 1
+    picked = []
+    for slot in range(frames_per_clip):
+        picked.append(frames[round(slot * final_index / slots)])
+    return picked
+
+
+def _frame_time(frame: dict[str, Any]) -> dict[str, Any]:
+    """Project a frame record onto the fields stored in a clip's ``frame_times``.
+
+    Missing fields are copied as ``None``; ``beijing_time`` is included only
+    when the frame has a non-``None`` value for it.
+    """
+    copied_keys = ("frame_id", "frame_path", "offset_seconds", "timecode", "pts_time")
+    summary = {key: frame.get(key) for key in copied_keys}
+    beijing_time = frame.get("beijing_time")
+    if beijing_time is not None:
+        summary["beijing_time"] = beijing_time
+    return summary
+
+
+def _clip_beijing_time_range(
+    frames: list[dict[str, Any]],
+    start: float,
+    end: float,
+) -> tuple[str | None, str | None]:
+    """Derive the clip's start/end wall-clock times from a reference frame.
+
+    The first frame with a truthy ``beijing_time`` is the reference; its time
+    and ``offset_seconds`` are handed to ``derive_time_from_reference`` once
+    for ``start`` and once for ``end``. Returns ``(None, None)`` when no frame
+    carries a ``beijing_time``.
+    """
+    anchor = next((frame for frame in frames if frame.get("beijing_time")), None)
+    if anchor is None:
+        return None, None
+
+    reference_time = str(anchor.get("beijing_time"))
+    reference_offset = anchor.get("offset_seconds")
+    clip_start_time = derive_time_from_reference(
+        reference_time,
+        reference_offset_seconds=reference_offset,
+        target_offset_seconds=start,
+    )
+    clip_end_time = derive_time_from_reference(
+        reference_time,
+        reference_offset_seconds=reference_offset,
+        target_offset_seconds=end,
+    )
+    return clip_start_time, clip_end_time
--- a/video_ai_analysis_poc/config.py
+++ b/video_ai_analysis_poc/config.py
@@ -0,0 +1,278 @@
+from __future__ import annotations
+
+import ast
+from pathlib import Path
+from typing import Any
+
+from .paths import resolve_path, validate_output_dir
+
+
+# Default config file: config/local_batch.yaml next to this module's package directory.
+DEFAULT_CONFIG_PATH = Path(__file__).resolve().parent.parent / "config" / "local_batch.yaml"
+
+
+def load_config(
+    config_path: str | Path = DEFAULT_CONFIG_PATH,
+    *,
+    input_dir: str | Path | None = None,
+    output_dir: str | Path | None = None,
+) -> dict[str, Any]:
+    """Load the config file, apply defaults and overrides, and normalise values.
+
+    Relative input/output directories are resolved against the directory that
+    contains the config file, or against its parent when the file lives in a
+    directory named ``config``. ``input_dir`` and ``output_dir`` replace the
+    configured directories when given. The resolved pair is checked with
+    ``validate_output_dir``; extensions, the ``recursive`` flag and the ffprobe
+    timeout are coerced to their final types.
+    """
+    path = Path(config_path).expanduser().resolve(strict=False)
+    config = _with_defaults(_parse_simple_yaml(path))
+    input_section = config["input"]
+    output_section = config["output"]
+
+    config_home = path.parent
+    base_dir = config_home.parent if config_home.name == "config" else config_home
+
+    # Explicit arguments win over whatever the file says.
+    if input_dir is not None:
+        input_section["dir"] = str(input_dir)
+    if output_dir is not None:
+        output_section["dir"] = str(output_dir)
+
+    input_section["dir"] = str(resolve_path(input_section["dir"], base_dir=base_dir))
+    output_section["dir"] = str(resolve_path(output_section["dir"], base_dir=base_dir))
+    validate_output_dir(input_section["dir"], output_section["dir"])
+
+    input_section["extensions"] = _normalize_extensions(
+        input_section.get("extensions", [])
+    )
+    input_section["recursive"] = bool(input_section.get("recursive", True))
+
+    ffprobe_section = config.setdefault("ffprobe", {})
+    ffprobe_section["timeout_seconds"] = int(ffprobe_section.get("timeout_seconds", 30))
+    return config
+
+
+def _with_defaults(config: dict[str, Any]) -> dict[str, Any]:
+    """Overlay ``config`` on the built-in defaults, merging one level deep.
+
+    When both the default and the supplied value of a section are dicts, the
+    supplied keys update the default section (nested dicts inside a section
+    are replaced, not merged). Any other supplied value, including sections
+    that have no default, replaces the default outright.
+    """
+    defaults: dict[str, Any] = {
+        "input": {
+            "dir": "./videos",
+            "recursive": True,
+            "extensions": [".mp4", ".mov", ".mkv", ".avi", ".flv", ".ts", ".m4v"],
+        },
+        "output": {
+            "dir": "./outputs/local-batch",
+            "overwrite": False,
+            "resume": True,
+            "keep_frames": True,
+        },
+        "source": {"mode": "local"},
+        "hik_cloud": {
+            "api_base_url": "https://api2.hik-cloud.com",
+            "download_path": "/v1/carrier/cstorage/open/play/download",
+            "access_token": None,
+            "access_token_env": "HIK_CLOUD_ACCESS_TOKEN",
+            "devices": [],
+            "time_ranges": [],
+            "chunk_seconds": 600,
+            "timeout_seconds": 60,
+            "download_timeout_seconds": 600,
+        },
+        "ffprobe": {"timeout_seconds": 30},
+        "ffmpeg": {
+            "prefer_nvdec": True,
+            "allow_cpu_fallback": False,
+            "hwaccel": "cuda",
+            "codec_decoders": {"h264": "h264_cuvid", "hevc": "hevc_cuvid"},
+            "frame_fps": 1,
+            "frame_width": 640,
+            "jpeg_quality": 4,
+            "timeout_seconds_per_video": 3600,
+        },
+        "clip": {
+            "length_seconds": 10,
+            "stride_seconds": 10,
+            "frames_per_clip": 8,
+            "min_frames_per_clip": 4,
+        },
+        "vlm": {
+            "api_base_url": "http://localhost:8679",
+            "chat_completions_path": "/v1/chat/completions",
+            "model": "memai-zhengxin-v3-20260413",
+            "timeout_seconds": 120,
+            "max_tokens": 512,
+            "temperature": 0,
+            "batch_size": 1,
+            "image_transport": "data_uri",
+            "retries": 1,
+        },
+        "prompt": {
+            "system": "You are a store video analysis assistant. Return strict JSON only.",
+            "user": "Analyze this clip. Return events and screen_time. If no event, return events: [].",
+        },
+        "schema": {
+            "version": "local-batch-v1",
+            "event_types": [
+                "customer_enter",
+                "customer_leave",
+                "queue_detected",
+                "staff_absent",
+                "staff_present",
+                "area_crowded",
+                "abnormal_behavior",
+                "unknown",
+            ],
+            "require_strict_json": True,
+            "parse_retry": 1,
+            "merge_gap_seconds": 30,
+        },
+        "runtime": {"timezone": "Asia/Shanghai", "log_level": "INFO"},
+    }
+    for name, override in config.items():
+        current = defaults.get(name)
+        if isinstance(override, dict) and isinstance(current, dict):
+            current.update(override)
+            continue
+        defaults[name] = override
+    return defaults
+
+
+def _normalize_extensions(extensions: list[str]) -> list[str]:
+    """Return the extensions lower-cased, trimmed and with a leading dot.
+
+    A single string is accepted as a one-element list, and blank entries are
+    dropped. Order and duplicates are otherwise preserved.
+    """
+    # A bare scalar such as ``extensions: mp4`` arrives as a string; iterating
+    # it directly would yield one "extension" per character.
+    if isinstance(extensions, str):
+        extensions = [extensions]
+    normalized = []
+    for extension in extensions:
+        value = str(extension).strip().lower()
+        if not value:
+            # An empty entry would otherwise turn into a bare ".".
+            continue
+        if not value.startswith("."):
+            value = f".{value}"
+        normalized.append(value)
+    return normalized
+
+
+def _parse_simple_yaml(path: Path) -> dict[str, Any]:
+    """Parse a restricted, indentation-based subset of YAML into nested dicts/lists.
+
+    Supported constructs: ``key: value`` mappings, ``key:`` followed by an
+    indented mapping or ``- `` list, list items that are scalars or start a
+    mapping (``- key: value``), and block scalars introduced by ``>``, ``>-``,
+    ``|`` or ``|-``. Blank lines and lines whose first non-space character is
+    ``#`` are skipped. Indentation is measured in leading spaces only.
+
+    Limitations visible in this function: a trailing ``# ...`` on a value line
+    is not removed here before the value is passed to ``_parse_scalar``, and
+    any list item containing ``:`` is treated as the start of a mapping.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+        ValueError: For a list item outside a list, a mapping entry directly
+            inside a list, or a line without ``:``.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"config file not found: {path}")
+
+    root: dict[str, Any] = {}
+    # Stack of (indent, container); the top entry is the container that lines
+    # indented deeper than ``indent`` belong to. -1 keeps the root on the stack.
+    stack: list[tuple[int, dict[str, Any] | list[Any]]] = [(-1, root)]
+    lines = path.read_text(encoding="utf-8").splitlines()
+
+    index = 0
+    while index < len(lines):
+        raw_line = lines[index].rstrip()
+        stripped = raw_line.strip()
+        if not stripped or raw_line.lstrip().startswith("#"):
+            index += 1
+            continue
+
+        indent = len(raw_line) - len(raw_line.lstrip(" "))
+        # Close every container that is at the same or a deeper indent level.
+        while indent <= stack[-1][0]:
+            stack.pop()
+        parent = stack[-1][1]
+
+        if stripped.startswith("- "):
+            if not isinstance(parent, list):
+                raise ValueError(f"list item without list parent: {raw_line}")
+            item = stripped[2:].strip()
+            if ":" in item:
+                # "- key: value" starts a new mapping inside the list.
+                key, value = item.split(":", 1)
+                mapping: dict[str, Any] = {}
+                parent.append(mapping)
+                key = key.strip()
+                value = value.strip()
+                if not value:
+                    # "- key:" opens a nested container; peek at the next
+                    # meaningful line to choose between list and dict.
+                    next_stripped = _next_stripped(lines, index)
+                    child: dict[str, Any] | list[Any]
+                    child = [] if next_stripped and next_stripped.startswith("- ") else {}
+                    mapping[key] = child
+                    # The mapping is registered at the dash's indent so that
+                    # sibling keys (indented past the dash) attach to it; the
+                    # child sits two columns deeper.
+                    stack.append((indent, mapping))
+                    stack.append((indent + 2, child))
+                else:
+                    mapping[key] = _parse_scalar(value)
+                    stack.append((indent, mapping))
+            else:
+                parent.append(_parse_scalar(item))
+            index += 1
+            continue
+
+        if not isinstance(parent, dict):
+            raise ValueError(f"mapping entry inside list is not supported: {raw_line}")
+
+        if ":" not in stripped:
+            raise ValueError(f"unsupported config line: {raw_line}")
+
+        key, value = stripped.split(":", 1)
+        key = key.strip()
+        value = value.strip()
+        if _is_block_scalar(value):
+            # The helper consumes the block's lines and returns the index of
+            # the first line after it, so ``index`` is not incremented here.
+            parent[key], index = _parse_block_scalar(lines, index, indent, value)
+            continue
+        if not value:
+            next_stripped = _next_stripped(lines, index)
+            child: dict[str, Any] | list[Any]
+            child = [] if next_stripped and next_stripped.startswith("- ") else {}
+            parent[key] = child
+            stack.append((indent, child))
+        else:
+            parent[key] = _parse_scalar(value)
+        index += 1
+
+    return root
+
+
+def _next_stripped(lines: list[str], current_index: int) -> str | None:
+    """Return the next non-blank, non-comment line after ``current_index``, stripped.
+
+    Returns ``None`` when only blank lines or ``#`` comment lines remain.
+    """
+    following = (raw.strip() for raw in lines[current_index + 1 :])
+    return next(
+        (text for text in following if text and not text.startswith("#")),
+        None,
+    )
+
+
+def _is_block_scalar(value: str) -> bool:
+    """Return True when ``value`` is one of the block markers ``>``, ``>-``, ``|``, ``|-``."""
+    block_markers = frozenset({">", ">-", "|", "|-"})
+    return value in block_markers
+
+
+def _parse_block_scalar(
+    lines: list[str],
+    start_index: int,
+    parent_indent: int,
+    marker: str,
+) -> tuple[str, int]:
+    """Collect the lines of a block scalar that starts after ``start_index``.
+
+    Lines are consumed until the first non-blank line indented no deeper than
+    ``parent_indent``. The indent of the first content line is cut from every
+    content line; blank lines are kept as empty lines. Content is always
+    joined with newlines, for ``>`` markers as well as ``|`` markers. Markers
+    ending in ``-`` drop trailing empty lines.
+
+    Returns:
+        The block text and the index of the first line that was not consumed.
+    """
+    collected: list[str] = []
+    body_indent: int | None = None
+    cursor = start_index + 1
+    total = len(lines)
+
+    while cursor < total:
+        line = lines[cursor].rstrip()
+        if not line:
+            collected.append("")
+        else:
+            depth = len(line) - len(line.lstrip(" "))
+            if depth <= parent_indent:
+                break
+            if body_indent is None:
+                body_indent = depth
+            collected.append(line[body_indent:])
+        cursor += 1
+
+    if marker.endswith("-"):
+        while collected and collected[-1] == "":
+            del collected[-1]
+    return "\n".join(collected), cursor
+
+
+def _strip_inline_comment(value: str) -> str:
+    """Remove a trailing ``# comment`` that follows whitespace outside quotes."""
+    open_quote = ""
+    position = 0
+    while position < len(value):
+        char = value[position]
+        previous = value[position - 1] if position else ""
+        if open_quote:
+            if char == "\\" and open_quote == '"':
+                # Skip the escaped character inside a double-quoted string.
+                position += 2
+                continue
+            if char == open_quote:
+                open_quote = ""
+        elif char in "\"'" and (not previous or previous in " \t[,"):
+            # Quotes only open at the start of a scalar or of a list element.
+            open_quote = char
+        elif char == "#" and previous and previous in " \t":
+            return value[:position].rstrip()
+        position += 1
+    return value
+
+
+def _parse_scalar(value: str) -> Any:
+    """Convert one scalar token from the config file into a Python value.
+
+    A trailing ``# comment`` preceded by whitespace is discarded first (a
+    ``#`` inside quotes or directly after a non-space character is kept).
+    Then, in order: ``true``/``false`` (any case) become booleans,
+    ``null``/``none`` become ``None``, ``[...]`` is evaluated as a Python list
+    literal, quoted text is unquoted, and finally ``int`` and ``float``
+    conversion are tried before falling back to the raw string.
+
+    Raises:
+        ValueError: If a bracketed or quoted value is not a valid literal, or
+            a bracketed value does not evaluate to a list.
+    """
+    value = _strip_inline_comment(value)
+    lower = value.lower()
+    if lower == "true":
+        return True
+    if lower == "false":
+        return False
+    if lower in {"null", "none"}:
+        return None
+    if value.startswith("[") and value.endswith("]"):
+        try:
+            parsed = ast.literal_eval(value)
+        except (SyntaxError, ValueError) as exc:
+            # literal_eval can raise SyntaxError; report config problems uniformly.
+            raise ValueError(f"invalid list value: {value}") from exc
+        if not isinstance(parsed, list):
+            raise ValueError(f"expected list value: {value}")
+        return parsed
+    if (value.startswith('"') and value.endswith('"')) or (
+        value.startswith("'") and value.endswith("'")
+    ):
+        try:
+            return ast.literal_eval(value)
+        except (SyntaxError, ValueError) as exc:
+            raise ValueError(f"invalid quoted value: {value}") from exc
+    try:
+        return int(value)
+    except ValueError:
+        pass
+    try:
+        return float(value)
+    except ValueError:
+        return value
--- a/video_ai_analysis_poc/discovery.py
+++ b/video_ai_analysis_poc/discovery.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def discover_videos(
+    input_dir: str | Path,
+    extensions: list[str],
+    *,
+    recursive: bool,
+) -> list[Path]:
+    """List the video files under ``input_dir`` whose suffix is allowed.
+
+    Extensions are compared case-insensitively and may be given with or
+    without the leading dot. With ``recursive`` the whole tree is searched,
+    otherwise only the directory's direct entries. The result is sorted.
+
+    Raises:
+        FileNotFoundError: If ``input_dir`` does not exist.
+        NotADirectoryError: If ``input_dir`` is not a directory.
+    """
+    root = Path(input_dir).expanduser()
+    if not root.exists():
+        raise FileNotFoundError(f"input dir not found: {root}")
+    if not root.is_dir():
+        raise NotADirectoryError(f"input path is not a directory: {root}")
+
+    wanted_suffixes = set()
+    for extension in extensions:
+        lowered = extension.lower()
+        wanted_suffixes.add(lowered if extension.startswith(".") else f".{lowered}")
+
+    candidates = root.rglob("*") if recursive else root.iterdir()
+    matches = [
+        entry
+        for entry in candidates
+        if entry.is_file() and entry.suffix.lower() in wanted_suffixes
+    ]
+    matches.sort()
+    return matches
--- a/video_ai_analysis_poc/ffmpeg_sampler.py
+++ b/video_ai_analysis_poc/ffmpeg_sampler.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import math
+import subprocess
+from pathlib import Path
+from typing import Any
+
+from .frames import build_frame_records
+from .manifest import read_jsonl, write_manifest
+from .timeline import DEFAULT_TIMEZONE, timeline_start_epoch
+
+
+# Codec names for which build_sample_command may select the hardware decode path.
+NVDEC_CODECS = {"h264", "hevc"}
+
+
+def build_sample_command(
+    video_path: str | Path,
+    output_dir: str | Path,
+    video_id: str,
+    ffmpeg_config: dict[str, Any],
+    *,
+    codec_name: str | None,
+    max_frames: int | None = None,
+    max_duration_seconds: float | None = None,
+) -> list[str]:
+    """Assemble the ffmpeg argument list that samples frames from one video.
+
+    Frames are written to ``<output_dir>/frames/<video_id>/%06d.jpg``. The
+    hardware decoder options are added when ``prefer_nvdec`` is set, the codec
+    is in ``NVDEC_CODECS`` and ``codec_decoders`` names a decoder for it.
+    Positive ``max_duration_seconds`` / ``max_frames`` add ``-t`` /
+    ``-frames:v`` limits.
+
+    Raises:
+        ValueError: If the hardware path is not selected and
+            ``allow_cpu_fallback`` is false.
+    """
+    destination = Path(output_dir).expanduser() / "frames" / video_id / "%06d.jpg"
+
+    codec = (codec_name or "").lower()
+    decoder_map = ffmpeg_config.get("codec_decoders", {})
+    decoder = decoder_map.get(codec) if isinstance(decoder_map, dict) else None
+    use_nvdec = (
+        bool(ffmpeg_config.get("prefer_nvdec", True))
+        and codec in NVDEC_CODECS
+        and decoder
+    )
+
+    decode_args: list[str] = []
+    if use_nvdec:
+        decode_args = [
+            "-hwaccel",
+            str(ffmpeg_config.get("hwaccel", "cuda")),
+            "-c:v",
+            str(decoder),
+        ]
+    elif not bool(ffmpeg_config.get("allow_cpu_fallback", False)):
+        raise ValueError(
+            f"NVDEC decoder is required for codec {codec_name!r}; CPU fallback is disabled"
+        )
+
+    fps_value = ffmpeg_config.get("frame_fps", 1)
+    width_value = ffmpeg_config.get("frame_width", 640)
+    quality_value = ffmpeg_config.get("jpeg_quality", 4)
+
+    command = [
+        "ffmpeg",
+        "-hide_banner",
+        "-y",
+        *decode_args,
+        "-i",
+        str(Path(video_path).expanduser()),
+    ]
+    if max_duration_seconds is not None and max_duration_seconds > 0:
+        command += ["-t", f"{max_duration_seconds:g}"]
+    command += [
+        "-vf",
+        f"fps={fps_value},scale={width_value}:-2",
+        "-q:v",
+        str(quality_value),
+    ]
+    if max_frames is not None and max_frames > 0:
+        command += ["-frames:v", str(max_frames)]
+    command.append(str(destination))
+    return command
+
+
+def sample_video_frames(
+    video_record: dict[str, Any],
+    output_dir: str | Path,
+    ffmpeg_config: dict[str, Any],
+    *,
+    manifest_path: str | Path | None = None,
+) -> list[dict[str, Any]]:
+    """Sample frames from one video with ffmpeg and return its frame records.
+
+    Frames are written below ``<output_dir>/frames/<video_id>``. On success
+    every record is annotated with the command, decoder, hwaccel and a stderr
+    summary. When ffmpeg exits non-zero, the frames found on disk are still
+    accepted if there is at least one and the expected frame count (when
+    known) was reached. Every other failure yields a single
+    ``sample_failed`` record instead of raising. When ``manifest_path`` is
+    given, this video's records in that manifest are replaced.
+
+    Args:
+        video_record: Needs ``video_id`` and ``path`` or ``source_path``.
+        output_dir: Output root directory.
+        ffmpeg_config: The ``ffmpeg`` configuration section.
+        manifest_path: Optional frame manifest to update.
+
+    Returns:
+        The frame records, or a one-element list with the failure record.
+    """
+    video_id = str(video_record["video_id"])
+    output_root = Path(output_dir).expanduser().resolve(strict=False)
+    frame_dir = output_root / "frames" / video_id
+    frame_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        source_path = video_record.get("path") or video_record.get("source_path")
+        if not source_path:
+            # Report a record without a usable path like any other failure
+            # instead of letting Path(None) raise TypeError.
+            raise ValueError(f"video record {video_id!r} has no path or source_path")
+        max_frames = _max_output_frames(video_record, ffmpeg_config)
+        timezone_name = str(ffmpeg_config.get("timezone", DEFAULT_TIMEZONE))
+        start_epoch = timeline_start_epoch(video_record)
+        command = build_sample_command(
+            source_path,
+            output_root,
+            video_id,
+            ffmpeg_config,
+            codec_name=video_record.get("codec_name"),
+            max_frames=max_frames,
+            max_duration_seconds=_record_duration_seconds(video_record),
+        )
+        completed = subprocess.run(
+            command,
+            capture_output=True,
+            text=True,
+            check=True,
+            timeout=int(ffmpeg_config.get("timeout_seconds_per_video", 3600)),
+        )
+        records = build_frame_records(
+            video_id,
+            output_root,
+            frame_dir.glob("*.jpg"),
+            frame_fps=float(ffmpeg_config.get("frame_fps", 1)),
+            timeline_start_epoch=start_epoch,
+            timezone_name=timezone_name,
+        )
+        _attach_success_evidence(
+            records,
+            command,
+            stderr=completed.stderr,
+        )
+    except subprocess.CalledProcessError as exc:
+        # Only subprocess.run raises this, so command, max_frames, start_epoch
+        # and timezone_name are all bound by the time we get here.
+        records = build_frame_records(
+            video_id,
+            output_root,
+            frame_dir.glob("*.jpg"),
+            frame_fps=float(ffmpeg_config.get("frame_fps", 1)),
+            timeline_start_epoch=start_epoch,
+            timezone_name=timezone_name,
+        )
+        if records and (max_frames is None or len(records) >= max_frames):
+            _attach_success_evidence(
+                records,
+                command,
+                stderr=exc.stderr,
+            )
+        else:
+            records = [_failure_record(video_id, exc)]
+    except (subprocess.TimeoutExpired, OSError, ValueError) as exc:
+        # OSError covers a missing or non-executable ffmpeg binary, which
+        # should fail this video rather than abort the whole run.
+        records = [_failure_record(video_id, exc)]
+
+    if manifest_path is not None:
+        _replace_video_records(Path(manifest_path), video_id, records)
+    return records
+
+
+def _replace_video_records(
+    manifest_path: Path,
+    video_id: str,
+    new_records: list[dict[str, Any]],
+) -> None:
+    """Rewrite the manifest with ``video_id``'s old records swapped for ``new_records``.
+
+    Records of other videos keep their order; the new records are appended
+    after them.
+    """
+    kept = []
+    for record in read_jsonl(manifest_path):
+        if str(record.get("video_id")) == video_id:
+            continue
+        kept.append(record)
+    write_manifest(manifest_path, [*kept, *new_records])
+
+
+def _failure_record(video_id: str, exc: BaseException) -> dict[str, Any]:
+    """Build the placeholder frame record that marks a failed sampling run.
+
+    All frame fields are ``None``, ``status`` is ``"sample_failed"`` and
+    ``last_error`` holds the text produced by ``_error_text``.
+    """
+    record: dict[str, Any] = {"video_id": video_id}
+    record.update(
+        dict.fromkeys(
+            ("frame_id", "frame_path", "offset_seconds", "timecode", "pts_time")
+        )
+    )
+    record.update(
+        status="sample_failed",
+        retry_count=0,
+        last_error=_error_text(exc),
+    )
+    return record
+
+
+def _attach_success_evidence(
+    records: list[dict[str, Any]],
+    command: list[str],
+    *,
+    stderr: str | None,
+) -> None:
+    """Annotate every record in place with how its frames were produced.
+
+    Adds the ffmpeg command, the values following ``-c:v`` and ``-hwaccel``
+    in it (``None`` when absent) and a summary of stderr. The same command
+    list object is stored on every record.
+    """
+    decoder = _command_value_after(command, "-c:v")
+    hwaccel = _command_value_after(command, "-hwaccel")
+    summary = _stderr_summary(stderr)
+    for record in records:
+        record["ffmpeg_command"] = command
+        record["decoder"] = decoder
+        record["hwaccel"] = hwaccel
+        record["stderr_summary"] = summary
+
+
+def _command_value_after(command: list[str], flag: str) -> str | None:
+    """Return the argument that follows the first ``flag`` in ``command``.
+
+    Returns ``None`` when the flag is absent or is the last argument.
+    """
+    if flag not in command:
+        return None
+    following = command.index(flag) + 1
+    return command[following] if following < len(command) else None
+
+
+def _stderr_summary(stderr: str | None, *, limit: int = 2000) -> str:
+    """Return stderr stripped of surrounding whitespace and cut to ``limit`` characters.
+
+    The beginning of the text is kept. ``None`` or empty input gives ``""``.
+    """
+    if not stderr:
+        return ""
+    return stderr.strip()[:limit]
+
+
+def _error_text(exc: BaseException) -> str:
+    """Turn an exception from the sampling step into a manifest error message.
+
+    A failed process reports its stderr, falling back to stdout and then to
+    the exception itself; a timeout reports the limit; anything else uses
+    ``str(exc)``.
+    """
+    if isinstance(exc, subprocess.TimeoutExpired):
+        return f"ffmpeg timed out after {exc.timeout}s"
+    if isinstance(exc, subprocess.CalledProcessError):
+        detail = exc.stderr or exc.stdout or exc
+        return str(detail)
+    return str(exc)
+
+
+def _max_output_frames(
+    video_record: dict[str, Any],
+    ffmpeg_config: dict[str, Any],
+) -> int | None:
+    """Return the frame-count cap for a video, or ``None`` when it cannot be derived.
+
+    The cap is ``ceil(duration * frame_fps) + 1``, never less than 1. It is
+    unknown when the frame rate or the duration is missing or not positive.
+    """
+    frame_fps = _optional_float(ffmpeg_config.get("frame_fps", 1))
+    if frame_fps is None or frame_fps <= 0:
+        return None
+
+    duration_seconds = _record_duration_seconds(video_record)
+    if duration_seconds is None or duration_seconds <= 0:
+        return None
+
+    expected_frames = math.ceil(duration_seconds * frame_fps) + 1
+    return expected_frames if expected_frames > 1 else 1
+
+
+def _record_duration_seconds(video_record: dict[str, Any]) -> float | None:
+    """Return the video's duration in seconds, or ``None`` when unknown.
+
+    The ``actual_begin``/``actual_end`` pair is preferred, then
+    ``requested_begin``/``requested_end``; a pair is used only when both
+    values are present and the end is after the begin. Otherwise
+    ``duration_seconds`` is returned.
+    """
+    boundary_keys = (
+        ("actual_begin", "actual_end"),
+        ("requested_begin", "requested_end"),
+    )
+    for begin_key, end_key in boundary_keys:
+        begin = _optional_float(video_record.get(begin_key))
+        end = _optional_float(video_record.get(end_key))
+        if begin is None or end is None:
+            continue
+        if end > begin:
+            return end - begin
+    return _optional_float(video_record.get("duration_seconds"))
+
+
+def _optional_float(value: Any) -> float | None:
+    """Convert ``value`` to float, mapping ``None`` and the empty string to ``None``."""
+    if value is None:
+        return None
+    if value == "":
+        return None
+    return float(value)
--- a/video_ai_analysis_poc/frames.py
+++ b/video_ai_analysis_poc/frames.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Iterable
+
+from .timeline import DEFAULT_TIMEZONE, format_beijing_time
+
+
+def seconds_to_timecode(seconds: float | int | None) -> str | None:
+    """Format a number of seconds as ``HH:MM:SS``, or return ``None`` for ``None``.
+
+    The fractional part is truncated. Hours are not wrapped at 24.
+    """
+    if seconds is None:
+        return None
+    whole_seconds = int(float(seconds))
+    hours, within_hour = divmod(whole_seconds, 3600)
+    minutes, remaining_seconds = divmod(within_hour, 60)
+    return f"{hours:02d}:{minutes:02d}:{remaining_seconds:02d}"
+
+
+def build_frame_records(
+    video_id: str,
+    output_dir: str | Path,
+    frame_paths: Iterable[str | Path],
+    *,
+    frame_fps: float,
+    timeline_start_epoch: float | int | str | None = None,
+    timezone_name: str = DEFAULT_TIMEZONE,
+) -> list[dict[str, Any]]:
+    """Create one ``sampled`` frame record per frame file, ordered by path.
+
+    Frames are numbered from 1 in sorted path order, and frame ``n`` is placed
+    at ``(n - 1) / frame_fps`` seconds. Paths are stored relative to
+    ``output_dir`` where possible. ``beijing_time`` is added only when
+    ``format_beijing_time`` returns a value.
+    """
+    base_dir = Path(output_dir).expanduser().resolve(strict=False)
+    ordered_paths = sorted(Path(path) for path in frame_paths)
+
+    records = []
+    for position, frame_path in enumerate(ordered_paths):
+        frame_number = position + 1
+        offset_seconds = round(position / frame_fps, 6)
+        record = {
+            "video_id": video_id,
+            "frame_id": f"{video_id}_f{frame_number:06d}",
+            "frame_path": _relative_frame_path(frame_path, base_dir),
+            "offset_seconds": offset_seconds,
+            "timecode": seconds_to_timecode(offset_seconds),
+            "pts_time": offset_seconds,
+            "status": "sampled",
+            "retry_count": 0,
+            "last_error": None,
+        }
+        wall_clock = format_beijing_time(
+            timeline_start_epoch,
+            offset_seconds=offset_seconds,
+            timezone_name=timezone_name,
+        )
+        if wall_clock is not None:
+            record["beijing_time"] = wall_clock
+        records.append(record)
+    return records
+
+
+def _relative_frame_path(frame_path: Path, base_dir: Path) -> str:
+    """Return the frame path relative to ``base_dir`` in POSIX form.
+
+    Falls back to the resolved absolute path when the frame is not located
+    under ``base_dir``.
+    """
+    absolute = frame_path.expanduser().resolve(strict=False)
+    try:
+        shown = absolute.relative_to(base_dir)
+    except ValueError:
+        shown = absolute
+    return shown.as_posix()
--- a/video_ai_analysis_poc/hik_cloud.py
+++ b/video_ai_analysis_poc/hik_cloud.py
@@ -0,0 +1,450 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse, urlunparse
+import urllib.request
+from zoneinfo import ZoneInfo
+
+from .manifest import read_jsonl, write_manifest
+from .paths import hik_cloud_download_path
+
+
+# Timezone used when ``runtime.timezone`` is not configured and as the default
+# for parse_hik_time().
+DEFAULT_TIMEZONE = "Asia/Shanghai"
+# Length of one download chunk when ``hik_cloud.chunk_seconds`` is not set.
+DEFAULT_CHUNK_SECONDS = 600
+# Upper bound enforced on ``chunk_seconds`` by build_download_chunks().
+MAX_CHUNK_SECONDS = 3600
+# Base URL and path joined by request_download_address() to form the endpoint.
+DEFAULT_API_BASE_URL = "https://api2.hik-cloud.com"
+DEFAULT_DOWNLOAD_PATH = "/v1/carrier/cstorage/open/play/download"
+# Timeout for the address request (``hik_cloud.timeout_seconds``).
+DEFAULT_TIMEOUT_SECONDS = 60
+# Timeout for fetching the recording (``hik_cloud.download_timeout_seconds``).
+DEFAULT_DOWNLOAD_TIMEOUT_SECONDS = 600
+# File name of the JSONL download manifest written into the output directory.
+DOWNLOAD_MANIFEST_NAME = "hik_cloud_download_manifest.jsonl"
+# API response code that request_download_address() reports as "no_recording".
+NO_RECORDING_CODE = 80438027
+# strptime format accepted by parse_hik_time() for string time values.
+TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
+def parse_hik_time(value: str | int | float, timezone: str = DEFAULT_TIMEZONE) -> int:
+    """Convert a configured time value to integer epoch seconds.
+
+    Numbers are truncated with ``int``. Strings must match ``TIME_FORMAT``
+    and are interpreted as wall-clock time in ``timezone``.
+
+    Raises:
+        ValueError: For booleans, for unsupported types, or when a string
+            does not match ``TIME_FORMAT``.
+    """
+    # bool is a subclass of int, so it has to be rejected explicitly.
+    if isinstance(value, bool) or not isinstance(value, (int, float, str)):
+        raise ValueError(f"unsupported time value: {value!r}")
+    if not isinstance(value, str):
+        return int(value)
+    naive = datetime.strptime(value, TIME_FORMAT)
+    localized = naive.replace(tzinfo=ZoneInfo(timezone))
+    return int(localized.timestamp())
+
+
+def build_download_chunks(config: dict[str, Any]) -> list[dict[str, Any]]:
+    """Split every configured device/time-range pair into download chunks.
+
+    Reads ``hik_cloud.devices``, ``hik_cloud.time_ranges`` and
+    ``hik_cloud.chunk_seconds`` plus ``runtime.timezone`` from ``config``.
+    Each time range is cut into consecutive windows of at most
+    ``chunk_seconds`` seconds; the last window ends at the range end.
+
+    Returns:
+        One dict per window with the device serial, channel number, the
+        full requested range and the window's ``time_begin``/``time_end``.
+
+    Raises:
+        ValueError: If ``chunk_seconds`` is not in ``1..MAX_CHUNK_SECONDS``
+            or a time range does not end after it begins.
+    """
+    hik_section = config.get("hik_cloud", {})
+    zone_name = config.get("runtime", {}).get("timezone", DEFAULT_TIMEZONE)
+    chunk_seconds = int(hik_section.get("chunk_seconds", DEFAULT_CHUNK_SECONDS))
+    if chunk_seconds <= 0:
+        raise ValueError("chunk_seconds must be greater than 0")
+    if chunk_seconds > MAX_CHUNK_SECONDS:
+        raise ValueError("chunk_seconds must be less than or equal to 3600")
+
+    device_list = hik_section.get("devices", [])
+    range_list = hik_section.get("time_ranges", [])
+    chunks: list[dict[str, Any]] = []
+    for device in device_list:
+        for time_range in range_list:
+            range_start = parse_hik_time(time_range["begin"], zone_name)
+            range_stop = parse_hik_time(time_range["end"], zone_name)
+            if range_stop <= range_start:
+                raise ValueError("time range end must be after begin")
+
+            # Window starts are spaced chunk_seconds apart inside the range.
+            for window_start in range(range_start, range_stop, chunk_seconds):
+                window_stop = min(window_start + chunk_seconds, range_stop)
+                chunks.append(
+                    {
+                        "device_serial": device["device_serial"],
+                        "channel_no": device["channel_no"],
+                        "requested_begin": range_start,
+                        "requested_end": range_stop,
+                        "time_begin": window_start,
+                        "time_end": window_stop,
+                    }
+                )
+    return chunks
+
+
+def resolve_access_token(config_or_hik_config: dict[str, Any]) -> str:
+    """Return the access token from the configuration or the environment.
+
+    A non-empty ``access_token`` wins. Otherwise the environment variable
+    named by ``access_token_env`` is used when it holds a non-empty value.
+
+    Raises:
+        ValueError: If neither source yields a token.
+    """
+    settings = _hik_config(config_or_hik_config)
+    inline_token = settings.get("access_token")
+    if inline_token:
+        return str(inline_token)
+
+    env_name = settings.get("access_token_env")
+    env_value = os.environ.get(str(env_name)) if env_name else None
+    if env_value:
+        return env_value
+
+    raise ValueError(
+        "missing hik_cloud access_token; configure access_token or access_token_env"
+    )
+
+
+def request_download_address(
+    chunk: dict[str, Any],
+    hik_config: dict[str, Any],
+    *,
+    http_post: Any | None = None,
+) -> dict[str, Any]:
+    """Ask the API for the download address of one chunk.
+
+    Args:
+        chunk: Chunk dict as produced by ``build_download_chunks``.
+        hik_config: Settings providing the token and optional
+            ``api_base_url``, ``download_path`` and ``timeout_seconds``.
+        http_post: Optional replacement for ``_post_json``; called as
+            ``http_post(url, json_body, headers, timeout_seconds)``.
+
+    Returns:
+        The chunk metadata plus ``status`` and ``code``. ``status`` is
+        ``"address_ok"`` for code 0 (with ``url``, ``actual_begin`` and
+        ``actual_end``), ``"no_recording"`` for ``NO_RECORDING_CODE``, and
+        ``"address_failed"`` for any other code or when the POST raises.
+        Failure results carry a sanitized ``last_error``.
+
+    Raises:
+        ValueError: From ``resolve_access_token`` when no token is available.
+    """
+    token = resolve_access_token(hik_config)
+    base = str(hik_config.get("api_base_url") or DEFAULT_API_BASE_URL).rstrip("/")
+    endpoint = base + str(hik_config.get("download_path") or DEFAULT_DOWNLOAD_PATH)
+    request_headers = {
+        "Authorization": f"bearer {token}",
+        "Content-Type": "application/json",
+    }
+    request_body = {
+        "deviceSerial": chunk["device_serial"],
+        "channelNo": chunk["channel_no"],
+        "timeBegin": chunk["time_begin"],
+        "timeEnd": chunk["time_end"],
+    }
+    timeout = int(hik_config.get("timeout_seconds", DEFAULT_TIMEOUT_SECONDS))
+    send = http_post or _post_json
+    common = _chunk_metadata(chunk)
+
+    try:
+        response = send(endpoint, request_body, request_headers, timeout)
+    except Exception as exc:  # pragma: no cover - exact urllib failures vary.
+        return {
+            **common,
+            "status": "address_failed",
+            "code": None,
+            "last_error": _sanitize_error(exc, token),
+        }
+
+    code = _optional_int(response.get("code"))
+    if code != 0:
+        return {
+            **common,
+            "status": "no_recording" if code == NO_RECORDING_CODE else "address_failed",
+            "code": code,
+            "last_error": _api_error_message(response, token),
+        }
+
+    data = response.get("data") or {}
+    return {
+        **common,
+        "status": "address_ok",
+        "code": code,
+        "url": data.get("url"),
+        "actual_begin": _optional_int(data.get("actualBeginTime")),
+        "actual_end": _optional_int(data.get("actualEndTime")),
+    }
+
+
+def download_hik_cloud_recordings(
+    config: dict[str, Any],
+    output_dir: str | Path,
+    *,
+    address_client: Any | None = None,
+    download_url: Any | None = None,
+    download: bool = True,
+) -> list[dict[str, Any]]:
+    """Resolve and optionally download every configured recording chunk.
+
+    For each chunk from ``build_download_chunks`` the function either reuses
+    a previously downloaded file (resume), records an address failure,
+    records the address only (``download=False``), or downloads the file and
+    records the outcome. The manifest is written once, after all chunks have
+    been handled.
+
+    Args:
+        config: Full configuration; ``output.resume`` enables reuse of
+            records already present in the manifest.
+        output_dir: Directory that receives the manifest and the downloads.
+        address_client: Optional replacement for ``request_download_address``;
+            called as ``address_client(chunk, hik_config)`` and expected to
+            return a dict with a ``status`` key.
+        download_url: Optional replacement for ``_download_url``; called as
+            ``download_url(url, timeout_seconds=...)`` and expected to return
+            the file content as bytes.
+        download: When false, addresses are resolved but nothing is fetched.
+
+    Returns:
+        Video records for chunks that were downloaded in this run or reused
+        from the manifest. Failed and address-only chunks appear only in the
+        manifest.
+    """
+    output_path = Path(output_dir).expanduser().resolve(strict=False)
+    manifest_path = output_path / DOWNLOAD_MANIFEST_NAME
+    hik_config = _hik_config(config)
+    chunks = build_download_chunks(config)
+    resume = bool(config.get("output", {}).get("resume", False))
+    # Without resume the manifest starts empty, so the file written at the
+    # end contains only the records produced by this run.
+    manifest_records = read_jsonl(manifest_path) if resume else []
+    # Only records marked "downloaded" whose file still exists are reusable.
+    existing_downloads = {
+        _manifest_key(record): record
+        for record in manifest_records
+        if _is_resumable_download(record)
+    }
+    get_address = address_client or request_download_address
+    fetch = download_url or _download_url
+    download_timeout_seconds = int(
+        hik_config.get("download_timeout_seconds", DEFAULT_DOWNLOAD_TIMEOUT_SECONDS)
+    )
+    # Token value used only to redact it from stored error messages.
+    token = _redaction_token(hik_config)
+
+    video_records: list[dict[str, Any]] = []
+    for chunk in chunks:
+        key = _chunk_key(chunk)
+        existing_record = existing_downloads.get(key)
+        # Reuse is skipped in address-only mode so the address is re-resolved.
+        if download and existing_record is not None:
+            video_records.append(_video_record_from_manifest(existing_record))
+            continue
+
+        # NOTE(review): an exception raised by a custom address_client is not
+        # caught here; since the manifest is only written after the loop,
+        # progress recorded so far would be lost.
+        address_result = get_address(chunk, hik_config)
+        status = address_result.get("status")
+        if status != "address_ok":
+            _upsert_manifest_record(
+                manifest_records,
+                _manifest_record(
+                    chunk,
+                    address_result,
+                    status=str(status or "address_failed"),
+                    token=token,
+                ),
+            )
+            continue
+
+        if not download:
+            _upsert_manifest_record(
+                manifest_records,
+                _manifest_record(
+                    chunk,
+                    address_result,
+                    status="address_ok",
+                    token=token,
+                ),
+            )
+            continue
+
+        url = str(address_result.get("url") or "")
+        target_path = hik_cloud_download_path(
+            output_path,
+            str(chunk["device_serial"]),
+            chunk["channel_no"],
+            int(chunk["time_begin"]),
+            int(chunk["time_end"]),
+        )
+        try:
+            # The whole recording is held in memory before it is written.
+            payload = fetch(url, timeout_seconds=download_timeout_seconds)
+            target_path.parent.mkdir(parents=True, exist_ok=True)
+            target_path.write_bytes(payload)
+        except Exception as exc:  # pragma: no cover - concrete network failures vary.
+            _upsert_manifest_record(
+                manifest_records,
+                _manifest_record(
+                    chunk,
+                    address_result,
+                    status="download_failed",
+                    path=target_path,
+                    last_error=_sanitize_error(exc, token),
+                    token=token,
+                ),
+            )
+            continue
+
+        record = _downloaded_video_record(chunk, address_result, target_path)
+        video_records.append(record)
+        _upsert_manifest_record(
+            manifest_records,
+            _manifest_record(
+                chunk,
+                address_result,
+                status="downloaded",
+                path=target_path,
+                token=token,
+            ),
+        )
+
+    write_manifest(manifest_path, manifest_records)
+    return video_records
+
+
+def _post_json(
+    url: str,
+    json_body: dict[str, Any],
+    headers: dict[str, str],
+    timeout_seconds: int,
+) -> dict[str, Any]:
+    """POST ``json_body`` as UTF-8 JSON and return the decoded JSON response."""
+    encoded_body = json.dumps(json_body).encode("utf-8")
+    request = urllib.request.Request(
+        url,
+        data=encoded_body,
+        headers=headers,
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
+        raw_body = response.read()
+    return json.loads(raw_body.decode("utf-8"))
+
+
+def _download_url(url: str, *, timeout_seconds: int | None = None) -> bytes:
+    """Fetch ``url`` and return the whole response body as bytes.
+
+    Only ``http`` and ``https`` URLs are accepted. ``urlopen`` also serves
+    other schemes such as ``file:``, so an unexpected download address could
+    otherwise make the process read a local file.
+
+    Args:
+        url: Download address to fetch.
+        timeout_seconds: Timeout handed to ``urlopen``.
+
+    Raises:
+        ValueError: If the URL scheme is not ``http`` or ``https``.
+    """
+    scheme = urlparse(url).scheme
+    if scheme not in ("http", "https"):
+        raise ValueError(f"unsupported download url scheme: {scheme!r}")
+    with urllib.request.urlopen(url, timeout=timeout_seconds) as response:
+        return response.read()
+
+
+def _hik_config(config_or_hik_config: dict[str, Any]) -> dict[str, Any]:
+    """Return the ``hik_cloud`` section when present, else the argument itself.
+
+    This lets callers pass either the full configuration or the section.
+    """
+    nested = config_or_hik_config.get("hik_cloud")
+    return nested if isinstance(nested, dict) else config_or_hik_config
+
+
+def _chunk_metadata(chunk: dict[str, Any]) -> dict[str, Any]:
+    """Copy the identifying fields of a chunk into a new dict.
+
+    ``requested_begin`` and ``requested_end`` are optional and default to
+    ``None``; the other four keys must be present.
+    """
+    metadata = {
+        "device_serial": chunk["device_serial"],
+        "channel_no": chunk["channel_no"],
+    }
+    for optional_key in ("requested_begin", "requested_end"):
+        metadata[optional_key] = chunk.get(optional_key)
+    for required_key in ("time_begin", "time_end"):
+        metadata[required_key] = chunk[required_key]
+    return metadata
+
+
+def _optional_int(value: Any) -> int | None:
+    """Convert ``value`` with ``int``; ``None`` and ``""`` become ``None``."""
+    is_missing = value is None or value == ""
+    return None if is_missing else int(value)
+
+
+def _api_error_message(response: dict[str, Any], token: str) -> str:
+    """Build a sanitized error string from an API response.
+
+    The text comes from ``msg``, then ``message``, then a fixed fallback.
+    """
+    detail = response.get("msg") or response.get("message") or "hik api error"
+    api_code = response.get("code")
+    return _sanitize_error(f"hik api code {api_code}: {detail}", token)
+
+
+def _sanitize_error(value: Any, token: str = "") -> str | None:
+    """Return ``str(value)`` with URLs, credentials and the token redacted.
+
+    Steps, in order:
+      1. Every ``http(s)`` URL is reduced to scheme, host and path; query,
+         fragment, params and any ``user:password@`` prefix are dropped.
+      2. Remaining ``sign=``, ``sig=``, ``token=`` and ``access_token=``
+         pairs are replaced by ``[redacted-query]``.
+      3. Literal occurrences of ``token`` are replaced by ``[redacted]``.
+      4. The word ``Authorization`` is replaced by ``[redacted-header]``.
+
+    Args:
+        value: Exception or message to sanitize; ``None`` is passed through.
+        token: Secret to scrub from the message; ignored when empty.
+
+    Returns:
+        The sanitized message, or ``None`` when ``value`` is ``None``.
+    """
+    if value is None:
+        return None
+
+    def _strip_url(match: re.Match[str]) -> str:
+        try:
+            parsed = urlparse(match.group(0))
+        except ValueError:
+            # A malformed URL must not make the sanitizer itself fail.
+            return "[redacted-url]"
+        host = parsed.netloc.rpartition("@")[2]
+        return urlunparse((parsed.scheme, host, parsed.path, "", "", ""))
+
+    # Each URL is rewritten where it occurs. Replacing by text could leave
+    # the query tail of a longer URL behind when a shorter URL is its prefix.
+    message = re.sub(r"https?://[^\s'\"<>]+", _strip_url, str(value))
+    message = re.sub(
+        r"\b(?:sign|sig|token|access_token)=[^&\s'\"<>]+",
+        "[redacted-query]",
+        message,
+        flags=re.IGNORECASE,
+    )
+    if token:
+        message = message.replace(token, "[redacted]")
+    return message.replace("Authorization", "[redacted-header]")
+
+
+def _downloaded_video_record(
+    chunk: dict[str, Any],
+    address_result: dict[str, Any],
+    path: Path,
+) -> dict[str, Any]:
+    """Build the video record returned for a freshly downloaded chunk.
+
+    ``requested_begin``/``requested_end`` hold the chunk window, i.e. the
+    chunk's ``time_begin``/``time_end``.
+    """
+    record: dict[str, Any] = {"source": "hik_cloud", "path": str(path)}
+    record["source_path"] = _source_path(chunk)
+    for key in ("device_serial", "channel_no"):
+        record[key] = chunk[key]
+    record["requested_begin"] = chunk["time_begin"]
+    record["requested_end"] = chunk["time_end"]
+    for key in ("actual_begin", "actual_end"):
+        record[key] = address_result.get(key)
+    record.update(status="downloaded", retry_count=0, last_error=None)
+    return record
+
+
+def _manifest_record(
+    chunk: dict[str, Any],
+    address_result: dict[str, Any],
+    *,
+    status: str,
+    token: str,
+    path: Path | None = None,
+    last_error: str | None = None,
+) -> dict[str, Any]:
+    """Build the manifest entry describing the outcome for one chunk.
+
+    Only the host of the download URL is stored, never the URL itself.
+    ``code`` is copied when the address result has one, and ``source_path``
+    is added for status ``"downloaded"``. The error text is taken from
+    ``last_error`` or the address result and sanitized with ``token``.
+    """
+    manifest_entry = {
+        "source": "hik_cloud",
+        "device_serial": chunk["device_serial"],
+        "channel_no": chunk["channel_no"],
+        "requested_begin": chunk["time_begin"],
+        "requested_end": chunk["time_end"],
+        "actual_begin": address_result.get("actual_begin"),
+        "actual_end": address_result.get("actual_end"),
+        "path": None if path is None else str(path),
+        "status": status,
+        "retry_count": 0,
+        "last_error": _sanitize_error(last_error or address_result.get("last_error"), token),
+    }
+    download_url = address_result.get("url")
+    if download_url:
+        manifest_entry["download_url_host"] = urlparse(str(download_url)).netloc
+    if "code" in address_result:
+        manifest_entry["code"] = address_result["code"]
+    if status == "downloaded":
+        manifest_entry["source_path"] = _source_path(chunk)
+    return manifest_entry
+
+
+def _source_path(chunk: dict[str, Any]) -> str:
+    """Return the ``hik_cloud://`` identifier for a chunk or manifest record.
+
+    ``time_begin``/``time_end`` are preferred; when a key is absent the
+    matching ``requested_begin``/``requested_end`` value is used instead.
+    """
+    begin = chunk["time_begin"] if "time_begin" in chunk else chunk.get("requested_begin")
+    end = chunk["time_end"] if "time_end" in chunk else chunk.get("requested_end")
+    device = chunk["device_serial"]
+    channel = chunk["channel_no"]
+    window = f"{int(begin)}-{int(end)}"
+    return f"hik_cloud://{device}/ch{channel}/{window}"
+
+
+def _is_resumable_download(record: dict[str, Any]) -> bool:
+    """Tell whether a manifest record points at a finished, existing file.
+
+    True only when the status is ``"downloaded"``, ``path`` is a string and
+    that path exists on disk.
+    """
+    if record.get("status") != "downloaded":
+        return False
+    stored_path = record.get("path")
+    return isinstance(stored_path, str) and Path(stored_path).exists()
+
+
+def _video_record_from_manifest(record: dict[str, Any]) -> dict[str, Any]:
+    """Rebuild a video record from a stored manifest entry.
+
+    The stored ``source_path`` is reused when set; otherwise it is derived
+    from the entry. The status is always reported as ``"downloaded"``.
+    """
+    restored: dict[str, Any] = {"source": "hik_cloud", "path": record["path"]}
+    restored["source_path"] = record.get("source_path") or _source_path(record)
+    for key in ("device_serial", "channel_no", "requested_begin", "requested_end"):
+        restored[key] = record[key]
+    for key in ("actual_begin", "actual_end"):
+        restored[key] = record.get(key)
+    restored["status"] = "downloaded"
+    restored["retry_count"] = record.get("retry_count", 0)
+    restored["last_error"] = record.get("last_error")
+    return restored
+
+
+def _upsert_manifest_record(
+    records: list[dict[str, Any]],
+    new_record: dict[str, Any],
+) -> None:
+    """Replace the first record with the same manifest key, or append.
+
+    ``records`` is modified in place.
+    """
+    target_key = _manifest_key(new_record)
+    position = next(
+        (
+            index
+            for index, existing in enumerate(records)
+            if _manifest_key(existing) == target_key
+        ),
+        None,
+    )
+    if position is None:
+        records.append(new_record)
+    else:
+        records[position] = new_record
+
+
+def _chunk_key(chunk: dict[str, Any]) -> tuple[Any, Any, Any, Any]:
+    """Return the (device, channel, begin, end) identity of a chunk.
+
+    Reads ``time_begin``/``time_end``; missing keys yield ``None``.
+    """
+    key_fields = ("device_serial", "channel_no", "time_begin", "time_end")
+    return tuple(chunk.get(field) for field in key_fields)
+
+
+def _manifest_key(record: dict[str, Any]) -> tuple[Any, Any, Any, Any]:
+    """Return the (device, channel, begin, end) identity of a manifest record.
+
+    Reads ``requested_begin``/``requested_end``; missing keys yield ``None``.
+    """
+    key_fields = ("device_serial", "channel_no", "requested_begin", "requested_end")
+    return tuple(record.get(field) for field in key_fields)
+
+
+def _redaction_token(hik_config: dict[str, Any]) -> str:
+    """Return the token to scrub from messages, or ``""`` when none is set.
+
+    Uses the same lookup order as ``resolve_access_token`` but never raises.
+    """
+    configured = hik_config.get("access_token")
+    if configured:
+        return str(configured)
+    env_name = hik_config.get("access_token_env")
+    if not env_name:
+        return ""
+    return os.environ.get(str(env_name), "")
--- a/video_ai_analysis_poc/manifest.py
+++ b/video_ai_analysis_poc/manifest.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Iterable
+
+
+def write_manifest(path: str | Path, records: Iterable[dict[str, Any]]) -> None:
+    """Write ``records`` to ``path`` as JSON Lines, replacing any existing file.
+
+    Parent directories are created. Each record is normalized first and
+    serialized with sorted keys and non-ASCII characters left unescaped.
+    """
+    target = Path(path).expanduser().resolve(strict=False)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    with target.open("w", encoding="utf-8") as handle:
+        for record in records:
+            serialized = json.dumps(
+                _normalize_record(record), ensure_ascii=False, sort_keys=True
+            )
+            handle.write(serialized + "\n")
+
+
+def read_jsonl(path: str | Path) -> list[dict[str, Any]]:
+    """Read a JSON Lines file and return one parsed object per non-blank line.
+
+    Args:
+        path: File to read; ``~`` is expanded.
+
+    Returns:
+        The parsed records in file order, or ``[]`` when the file is missing.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    jsonl_path = Path(path).expanduser().resolve(strict=False)
+    if not jsonl_path.exists():
+        return []
+    text = jsonl_path.read_text(encoding="utf-8")
+    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029 and
+    # U+0085, which json.dumps(..., ensure_ascii=False) writes unescaped
+    # inside strings, so such a record would be cut in half.
+    return [json.loads(line) for line in text.split("\n") if line.strip()]
+
+
+def _normalize_record(record: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of ``record`` with the standard bookkeeping keys filled in.
+
+    Missing ``status``, ``retry_count`` and ``last_error`` keys receive
+    ``"pending"``, ``0`` and ``None``. The input dict is not modified.
+    """
+    defaults = {"status": "pending", "retry_count": 0, "last_error": None}
+    normalized = dict(record)
+    for key, fallback in defaults.items():
+        normalized.setdefault(key, fallback)
+    return normalized
--- a/video_ai_analysis_poc/paths.py
+++ b/video_ai_analysis_poc/paths.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+
+# Default directory tree that validate_output_dir() refuses as an output
+# location.
+# NOTE(review): machine-specific absolute path; presumably a reference
+# project that must not be written to - confirm and consider making it
+# configurable.
+FORBIDDEN_REFERENCE_ROOT = Path("/Users/yoilun/AI-train/zhengxin-vlm-0413")
+
+
+def resolve_path(path: str | Path, *, base_dir: Path | None = None) -> Path:
+    """Expand ``~`` and return ``path`` as a resolved path.
+
+    A relative path is joined onto ``base_dir`` first when one is given.
+    The path does not have to exist.
+    """
+    expanded = Path(path).expanduser()
+    if base_dir is not None and not expanded.is_absolute():
+        expanded = base_dir / expanded
+    return expanded.resolve(strict=False)
+
+
+def _is_relative_to(path: Path, parent: Path) -> bool:
+    """Return True when ``path`` equals ``parent`` or lies underneath it."""
+    try:
+        path.relative_to(parent)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+def validate_output_dir(
+    input_dir: str | Path,
+    output_dir: str | Path,
+    *,
+    forbidden_root: Path = FORBIDDEN_REFERENCE_ROOT,
+) -> Path:
+    """Check that the output directory is an acceptable place to write.
+
+    Returns:
+        The resolved output directory.
+
+    Raises:
+        ValueError: If the output directory equals the input directory or
+            is ``forbidden_root`` or a location inside it.
+    """
+    source_root = resolve_path(input_dir)
+    destination = resolve_path(output_dir)
+    blocked_root = resolve_path(forbidden_root)
+
+    if destination == source_root:
+        raise ValueError("output dir must not equal input dir")
+    if not _is_relative_to(destination, blocked_root):
+        return destination
+    raise ValueError(
+        f"output dir must not be inside forbidden reference dir: {blocked_root}"
+    )
+
+
+def stable_video_id(path: str | Path) -> str:
+    """Derive a deterministic id from the resolved path of a video.
+
+    The id is ``video-`` followed by the first 16 hex digits of the SHA-1
+    of the resolved path string.
+    """
+    canonical = str(resolve_path(path)).encode("utf-8")
+    return "video-" + hashlib.sha1(canonical).hexdigest()[:16]
+
+
+def hik_cloud_download_path(
+    output_dir: str | Path,
+    device_serial: str,
+    channel_no: int | str,
+    time_begin: int,
+    time_end: int,
+) -> Path:
+    """Return the file path a downloaded chunk is stored at.
+
+    Layout: ``<output_dir>/downloads/hik_cloud/<device>/ch<channel>/`` with
+    a file named ``<device>_ch<channel>_<begin>_<end>.mp4``. Device and
+    channel are passed through ``_safe_path_component``.
+    """
+    device_part = _safe_path_component(device_serial)
+    channel_dir = f"ch{_safe_path_component(str(channel_no))}"
+    file_name = f"{device_part}_{channel_dir}_{int(time_begin)}_{int(time_end)}.mp4"
+    return resolve_path(output_dir).joinpath(
+        "downloads",
+        "hik_cloud",
+        device_part,
+        channel_dir,
+        file_name,
+    )
+
+
+def _safe_path_component(value: str) -> str:
+    """Turn ``value`` into a string usable as a single path component.
+
+    Characters other than alphanumerics, ``.``, ``_`` and ``-`` become
+    ``_``. The results ``""``, ``"."`` and ``".."`` are not usable as a
+    component (they vanish or change directory when joined), so their dots
+    are replaced as well and an empty result becomes ``"_"``.
+    """
+    cleaned = "".join(
+        char if char.isalnum() or char in "._-" else "_" for char in value
+    )
+    if cleaned in ("", ".", ".."):
+        return cleaned.replace(".", "_") or "_"
+    return cleaned
--- a/video_ai_analysis_poc/probe.py
+++ b/video_ai_analysis_poc/probe.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import json
+import subprocess
+from pathlib import Path
+from typing import Any
+
+
+def probe_video(path: str | Path, *, timeout_seconds: int = 30) -> dict[str, Any]:
+    """Probe a video file with ``ffprobe`` and summarize its first video stream.
+
+    Expected failures are reported in the returned record instead of being
+    raised: a timeout, a non-zero exit status, an ``ffprobe`` binary that
+    cannot be started, unparsable output, or output without a video stream.
+
+    Args:
+        path: Video file to probe; ``~`` is expanded.
+        timeout_seconds: Maximum time ``ffprobe`` may run.
+
+    Returns:
+        On success a record with status ``"probed"`` plus duration, codec,
+        width, height, fps, format name and start time. On failure a record
+        with status ``"probe_failed"`` and ``last_error`` set.
+    """
+    video_path = Path(path).expanduser().resolve(strict=False)
+    base_record: dict[str, Any] = {
+        "path": str(video_path),
+        "status": "probe_failed",
+        "retry_count": 0,
+        "last_error": None,
+    }
+    command = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-print_format",
+        "json",
+        "-show_format",
+        "-show_streams",
+        str(video_path),
+    ]
+
+    try:
+        completed = subprocess.run(
+            command,
+            capture_output=True,
+            text=True,
+            check=True,
+            timeout=timeout_seconds,
+        )
+        payload = json.loads(completed.stdout or "{}")
+        video_stream = _first_video_stream(payload)
+        format_info = payload.get("format", {})
+        return {
+            **base_record,
+            "status": "probed",
+            "duration_seconds": _optional_float(format_info.get("duration")),
+            "codec_name": video_stream.get("codec_name"),
+            "width": _optional_int(video_stream.get("width")),
+            "height": _optional_int(video_stream.get("height")),
+            "fps": _parse_frame_rate(
+                video_stream.get("avg_frame_rate") or video_stream.get("r_frame_rate")
+            ),
+            "format_name": format_info.get("format_name"),
+            "start_time": _optional_float(format_info.get("start_time")),
+        }
+    except subprocess.TimeoutExpired as exc:
+        base_record["last_error"] = f"ffprobe timed out after {timeout_seconds}s"
+        if exc.stderr:
+            # TimeoutExpired carries captured output as bytes even with
+            # text=True, so decode it instead of formatting a b'...' repr.
+            base_record["last_error"] += f": {_error_text(exc.stderr)}"
+        return base_record
+    except subprocess.CalledProcessError as exc:
+        base_record["last_error"] = _error_text(exc.stderr or exc.stdout or str(exc))
+        return base_record
+    except OSError as exc:
+        # Raised when the ffprobe executable is missing or cannot be run.
+        base_record["last_error"] = f"ffprobe could not be started: {exc}"
+        return base_record
+    except (json.JSONDecodeError, ValueError) as exc:
+        base_record["last_error"] = f"ffprobe parse failed: {exc}"
+        return base_record
+
+
+def _first_video_stream(payload: dict[str, Any]) -> dict[str, Any]:
+    """Return the first stream whose ``codec_type`` is ``"video"``.
+
+    Raises:
+        ValueError: If the payload lists no video stream.
+    """
+    video_streams = (
+        stream
+        for stream in payload.get("streams", [])
+        if stream.get("codec_type") == "video"
+    )
+    first = next(video_streams, None)
+    if first is None:
+        raise ValueError("ffprobe output did not contain a video stream")
+    return first
+
+
+def _parse_frame_rate(value: str | None) -> float | None:
+    """Parse a frame rate given as ``"num/den"`` or as a plain number.
+
+    Returns ``None`` for empty input, ``"0/0"`` and a zero denominator.
+    """
+    if not value or value == "0/0":
+        return None
+    numerator, slash, denominator = value.partition("/")
+    if not slash:
+        return float(value)
+    divisor = float(denominator)
+    return None if divisor == 0 else float(numerator) / divisor
+
+
+def _optional_float(value: Any) -> float | None:
+    """Convert ``value`` with ``float``; ``None`` and ``""`` become ``None``."""
+    is_missing = value is None or value == ""
+    return None if is_missing else float(value)
+
+
+def _optional_int(value: Any) -> int | None:
+    """Convert ``value`` with ``int``; ``None`` and ``""`` become ``None``."""
+    is_missing = value is None or value == ""
+    return None if is_missing else int(value)
+
+
+def _error_text(value: Any) -> str:
+    """Return ``value`` as stripped text, decoding bytes as UTF-8.
+
+    Undecodable bytes are replaced rather than raising.
+    """
+    if isinstance(value, bytes):
+        text = value.decode("utf-8", errors="replace")
+    else:
+        text = str(value)
+    return text.strip()
--- a/video_ai_analysis_poc/result_parser.py
+++ b/video_ai_analysis_poc/result_parser.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def extract_json_payload(raw_response: str) -> dict[str, Any]:
+    """Extract the first JSON object found in a model response.
+
+    The stripped text is first parsed as a whole. If that fails or does not
+    give an object, decoding is attempted at each ``{`` from left to right
+    and the first object that decodes is returned.
+
+    Raises:
+        ValueError: If the text is empty or contains no decodable object.
+    """
+    text = raw_response.strip()
+    if not text:
+        raise ValueError("JSON payload is empty")
+
+    try:
+        whole = json.loads(text)
+    except json.JSONDecodeError:
+        whole = None
+    if isinstance(whole, dict):
+        return whole
+
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            candidate, _ = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            candidate = None
+        if isinstance(candidate, dict):
+            return candidate
+        start = text.find("{", start + 1)
+    raise ValueError("JSON object not found in model response")
+
+
+def build_clip_result(
+    raw_response: str,
+    clip_record: dict[str, Any],
+    video_record: dict[str, Any] | None,
+    config: dict[str, Any],
+    *,
+    processing: dict[str, Any] | None = None,
+    status: str | None = None,
+    error: str | None = None,
+) -> dict[str, Any]:
+    """Assemble the result record for one analyzed clip.
+
+    When ``status`` is given the response is not parsed and ``status`` and
+    ``error`` are used as they are. Otherwise the response is parsed: the
+    status becomes ``"ok"`` on success, or ``"parse_failed"`` with the
+    parser's message as the error. Events are included only for ``"ok"``.
+    """
+    processing_record = dict(processing or {})
+    payload: dict[str, Any] = {}
+    if status is not None:
+        result_status, result_error = status, error
+    else:
+        try:
+            payload = extract_json_payload(raw_response)
+        except ValueError as exc:
+            result_status, result_error = "parse_failed", str(exc)
+        else:
+            result_status, result_error = "ok", None
+
+    timeline = _timeline(clip_record, config, payload)
+    events = _events(payload, clip_record) if result_status == "ok" else []
+    return {
+        "schema_version": config.get("schema", {}).get("version", "local-batch-v1"),
+        "video_id": str(clip_record.get("video_id")),
+        "video_path": _video_path(video_record),
+        "clip_id": str(clip_record.get("clip_id")),
+        "status": result_status,
+        "monitoring_timeline": timeline,
+        "events": events,
+        "raw_response": raw_response,
+        "processing": processing_record,
+        "error": result_error,
+    }
+
+
+def _timeline(
+    clip_record: dict[str, Any],
+    config: dict[str, Any],
+    payload: dict[str, Any],
+) -> dict[str, Any]:
+    """Collect the timing fields of a clip for its result record.
+
+    The timezone comes from ``runtime.timezone``, the clip fields are copied
+    from ``clip_record``, and ``screen_time`` is the first non-empty value of
+    three alternative payload keys, as a string.
+    """
+    copied_keys = (
+        "video_start_time",
+        "clip_start_seconds",
+        "clip_end_seconds",
+        "clip_start_timecode",
+        "clip_end_timecode",
+        "clip_start_beijing_time",
+        "clip_end_beijing_time",
+    )
+    timeline: dict[str, Any] = {
+        "timezone": config.get("runtime", {}).get("timezone", "Asia/Shanghai"),
+    }
+    timeline.update((key, clip_record.get(key)) for key in copied_keys)
+    timeline["frame_times"] = clip_record.get("frame_times", [])
+    screen_time = (
+        payload.get("screen_time") or payload.get("画面时间") or payload.get("时间") or ""
+    )
+    timeline["screen_time"] = str(screen_time)
+    return timeline
+
+
+def _events(
+    payload: dict[str, Any],
+    clip_record: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Normalize the ``events`` list of a parsed payload.
+
+    A missing or non-list ``events`` value gives an empty list; entries that
+    are not dicts are skipped.
+    """
+    candidates = payload.get("events") or []
+    if not isinstance(candidates, list):
+        return []
+    normalized_events = []
+    for item in candidates:
+        if isinstance(item, dict):
+            normalized_events.append(_event(item, clip_record))
+    return normalized_events
+
+
+def _event(
+    event: dict[str, Any],
+    clip_record: dict[str, Any],
+) -> dict[str, Any]:
+    """Return a copy of ``event`` with every standard field present.
+
+    Fields already in the event are kept. Missing offsets default to the
+    clip's start/end seconds, and missing evidence defaults to the clip id
+    plus the non-empty frame paths of the clip.
+    """
+    frame_paths = [
+        frame.get("frame_path")
+        for frame in clip_record.get("frame_times", [])
+        if frame.get("frame_path")
+    ]
+    defaults = {
+        "event_type": "unknown",
+        "start_time": None,
+        "end_time": None,
+        "start_offset_seconds": clip_record.get("clip_start_seconds"),
+        "end_offset_seconds": clip_record.get("clip_end_seconds"),
+        "confidence": None,
+        "severity": None,
+        "attributes": {},
+        "evidence": {
+            "clip_id": clip_record.get("clip_id"),
+            "frame_paths": frame_paths,
+        },
+    }
+    normalized = dict(event)
+    for field, fallback in defaults.items():
+        normalized.setdefault(field, fallback)
+    return normalized
+
+
+def _video_path(video_record: dict[str, Any] | None) -> str | None:
+    """Return the video's ``path`` (or ``source_path``) as a string.
+
+    ``None`` is returned for a missing or empty record and when the
+    selected value is ``None``.
+    """
+    if not video_record:
+        return None
+    chosen = video_record.get("path") or video_record.get("source_path")
+    if chosen is None:
+        return None
+    return str(chosen)
--- a/video_ai_analysis_poc/timeline.py
+++ b/video_ai_analysis_poc/timeline.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+from typing import Any
+from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+
+
+# Format used to render and parse wall-clock times in this module.
+TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
+# Timezone used by default and as the fallback for unknown timezone names.
+DEFAULT_TIMEZONE = "Asia/Shanghai"
+
+
+def format_beijing_time(
+    epoch_seconds: float | int | str | None,
+    *,
+    offset_seconds: float | int = 0,
+    timezone_name: str = DEFAULT_TIMEZONE,
+) -> str | None:
+    """Format ``epoch_seconds + offset_seconds`` as local wall-clock time.
+
+    Returns ``None`` when ``epoch_seconds`` is ``None`` or ``""``; otherwise
+    a ``TIME_FORMAT`` string in the zone resolved from ``timezone_name``.
+    """
+    base_epoch = _optional_float(epoch_seconds)
+    if base_epoch is None:
+        return None
+    target_zone = _zone(timezone_name)
+    moment_utc = datetime.fromtimestamp(
+        base_epoch + float(offset_seconds), tz=timezone.utc
+    )
+    return moment_utc.astimezone(target_zone).strftime(TIME_FORMAT)
+
+
+def derive_time_from_reference(
+    reference_time: str | None,
+    *,
+    reference_offset_seconds: float | int | None,
+    target_offset_seconds: float | int | None,
+) -> str | None:
+    """Shift a known wall-clock time to another offset in the same video.
+
+    ``reference_time`` is the time at ``reference_offset_seconds``; the
+    result is the time at ``target_offset_seconds``. ``None`` is returned
+    when any input is missing or the reference does not match
+    ``TIME_FORMAT``.
+    """
+    if not reference_time:
+        return None
+    known_offset = _optional_float(reference_offset_seconds)
+    wanted_offset = _optional_float(target_offset_seconds)
+    if known_offset is None or wanted_offset is None:
+        return None
+    try:
+        anchor = datetime.strptime(reference_time, TIME_FORMAT)
+    except ValueError:
+        return None
+    shifted = anchor + timedelta(seconds=wanted_offset - known_offset)
+    return shifted.strftime(TIME_FORMAT)
+
+
+def timeline_start_epoch(record: dict[str, Any]) -> float | None:
+    """Return the record's start epoch, preferring ``actual_begin``.
+
+    Falls back to ``requested_begin``; ``None`` when neither is set.
+    """
+    # The generator is lazy, so requested_begin is only converted when
+    # actual_begin is missing.
+    candidates = (
+        _optional_float(record.get(key))
+        for key in ("actual_begin", "requested_begin")
+    )
+    return next((value for value in candidates if value is not None), None)
+
+
+def _zone(timezone_name: str) -> ZoneInfo:
+    """Return the named zone, or the default zone when the name is unknown."""
+    try:
+        zone = ZoneInfo(timezone_name)
+    except ZoneInfoNotFoundError:
+        zone = ZoneInfo(DEFAULT_TIMEZONE)
+    return zone
+
+
+def _optional_float(value: Any) -> float | None:
+    """Convert ``value`` with ``float``; ``None`` and ``""`` become ``None``."""
+    is_missing = value is None or value == ""
+    return None if is_missing else float(value)
--- a/video_ai_analysis_poc/vlm_client.py
+++ b/video_ai_analysis_poc/vlm_client.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import base64
+import json
+import time
+import urllib.request
+from pathlib import Path
+from typing import Any, Callable
+
+
+# Transport hook accepted by infer_clip(): it is called with
+# (url, payload, timeout_seconds) and must return a dict whose "status" and
+# "body" entries are read back by the caller.
+HttpPost = Callable[[str, dict[str, Any], int], dict[str, Any]]
+
+
+def infer_clip(
+    clip_record: dict[str, Any],
+    output_dir: str | Path,
+    vlm_config: dict[str, Any],
+    prompt_config: dict[str, Any],
+    *,
+    http_post: HttpPost | None = None,
+) -> dict[str, Any]:
+    """Send one clip to the chat endpoint and return the raw model reply.
+
+    Args:
+        clip_record: Clip entry forwarded to ``build_payload``.
+        output_dir: Base directory forwarded to ``build_payload``.
+        vlm_config: Endpoint settings; ``timeout_seconds`` defaults to 120.
+        prompt_config: Prompt texts forwarded to ``build_payload``.
+        http_post: Optional transport override; ``_post_json`` is used when
+            none is given.
+
+    Returns:
+        A dict with ``raw_response`` (the message content extracted from the
+        response body), ``http_status`` and ``latency_ms``.
+    """
+    # The clock starts before the payload is built, so ``latency_ms`` covers
+    # payload construction as well as the HTTP round trip.
+    started = time.monotonic()
+    post = http_post or _post_json
+    endpoint = build_chat_url(vlm_config)
+    request_body = build_payload(clip_record, output_dir, vlm_config, prompt_config)
+    timeout_seconds = int(vlm_config.get("timeout_seconds", 120))
+    reply = post(endpoint, request_body, timeout_seconds)
+    elapsed_ms = int((time.monotonic() - started) * 1000)
+
+    result: dict[str, Any] = {}
+    result["raw_response"] = _extract_message_content(reply.get("body"))
+    result["http_status"] = reply.get("status")
+    result["latency_ms"] = elapsed_ms
+    return result
+
+
+def build_chat_url(vlm_config: dict[str, Any]) -> str:
+    """Join the configured API base URL and chat-completions path.
+
+    Trailing slashes on ``api_base_url`` are dropped. If the configured path
+    does not already start with ``/`` (or with ``?``/``#``), a single ``/`` is
+    inserted so that a path such as ``v1/chat/completions`` is not glued
+    directly onto the host name. An empty path yields the base URL alone.
+
+    Raises:
+        KeyError: If ``api_base_url`` or ``chat_completions_path`` is missing.
+    """
+    base = str(vlm_config["api_base_url"]).rstrip("/")
+    path = str(vlm_config["chat_completions_path"])
+    if path and not path.startswith(("/", "?", "#")):
+        path = "/" + path
+    return base + path
+
+
+def build_payload(
+    clip_record: dict[str, Any],
+    output_dir: str | Path,
+    vlm_config: dict[str, Any],
+    prompt_config: dict[str, Any],
+) -> dict[str, Any]:
+    """Assemble the chat-completions request body for one clip.
+
+    The user message starts with the prompt text and is followed by one
+    ``image_url`` part for every entry of ``clip_record["frame_times"]`` that
+    has a non-empty ``frame_path``. A missing or ``null`` ``frame_times``
+    value is treated as "no frames".
+
+    Args:
+        clip_record: Clip entry holding the ``frame_times`` list.
+        output_dir: Base directory used to resolve relative frame paths.
+        vlm_config: Supplies ``model``, ``temperature`` (default 0),
+            ``max_tokens`` (default 512) and ``image_transport`` (default
+            ``"data_uri"``).
+        prompt_config: Supplies the ``system`` and ``user`` prompt texts;
+            missing entries become empty strings.
+
+    Returns:
+        A dict with ``model``, ``messages``, ``temperature`` and
+        ``max_tokens``.
+    """
+    # The transport is the same for every frame, so resolve it once.
+    image_transport = str(vlm_config.get("image_transport", "data_uri"))
+    # ``or []`` also covers a key that is present but set to null.
+    frames = clip_record.get("frame_times") or []
+
+    content: list[dict[str, Any]] = [
+        {"type": "text", "text": str(prompt_config.get("user", ""))}
+    ]
+    for frame in frames:
+        frame_path = frame.get("frame_path")
+        if not frame_path:
+            # Frames without a path have no image to attach.
+            continue
+        content.append(
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": _image_url(frame_path, output_dir, image_transport)
+                },
+            }
+        )
+
+    return {
+        "model": vlm_config.get("model"),
+        "messages": [
+            {"role": "system", "content": str(prompt_config.get("system", ""))},
+            {"role": "user", "content": content},
+        ],
+        "temperature": vlm_config.get("temperature", 0),
+        "max_tokens": vlm_config.get("max_tokens", 512),
+    }
+
+
+def _image_url(
+    frame_path: str | Path,
+    output_dir: str | Path,
+    image_transport: str,
+) -> str:
+    """Return the URL value to send for one frame image.
+
+    With the ``"data_uri"`` transport the file is read and embedded as a
+    base64 ``data:`` URI; relative paths are resolved against ``output_dir``.
+    Any other transport passes ``frame_path`` through as a plain string.
+    """
+    if image_transport == "data_uri":
+        resolved = Path(frame_path).expanduser()
+        if not resolved.is_absolute():
+            resolved = Path(output_dir).expanduser() / resolved
+        encoded = base64.b64encode(resolved.read_bytes()).decode("ascii")
+        mime = _mime_type(resolved)
+        return f"data:{mime};base64,{encoded}"
+    return str(frame_path)
+
+
+def _mime_type(path: Path) -> str:
+    """Map a file's extension to the MIME type used in the data URI.
+
+    The match is case-insensitive. JPEG, PNG and WebP are recognised; every
+    other extension yields ``application/octet-stream``.
+    """
+    known_types = {
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".webp": "image/webp",
+    }
+    return known_types.get(path.suffix.lower(), "application/octet-stream")
+
+
+def _post_json(
+    url: str,
+    payload: dict[str, Any],
+    timeout_seconds: int,
+) -> dict[str, Any]:
+    """POST ``payload`` as JSON to ``url`` and return status plus parsed body.
+
+    Returns:
+        ``{"status": <HTTP status>, "body": <decoded JSON>}``; an empty
+        response body yields ``{}`` for ``body``.
+
+    Errors raised by ``urlopen`` or by JSON decoding are not caught here and
+    propagate to the caller.
+    """
+    encoded_payload = json.dumps(payload).encode("utf-8")
+    post_request = urllib.request.Request(
+        url,
+        data=encoded_payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(post_request, timeout=timeout_seconds) as reply:
+        text = reply.read().decode("utf-8")
+        status = reply.status
+        parsed = json.loads(text) if text else {}
+    return {"status": status, "body": parsed}
+
+
+def _extract_message_content(body: Any) -> str:
+    """Return the first choice's message content from a chat response body.
+
+    Any shape that does not carry usable content yields ``""``: a non-dict
+    body, missing/empty/non-list ``choices``, a first choice or ``message``
+    that is not a dict, and ``content`` that is absent or ``null``. String
+    content is returned unchanged; any other content (for example a list of
+    parts) is serialised to JSON text without ASCII escaping.
+    """
+    if not isinstance(body, dict):
+        return ""
+    choices = body.get("choices")
+    if not isinstance(choices, (list, tuple)) or not choices:
+        return ""
+    first_choice = choices[0]
+    message = first_choice.get("message") if isinstance(first_choice, dict) else None
+    if not isinstance(message, dict):
+        # Covers both a missing message and an explicit ``"message": null``.
+        return ""
+    content = message.get("content")
+    if content is None:
+        # A null content must not be reported as the literal text "null".
+        return ""
+    if isinstance(content, str):
+        return content
+    return json.dumps(content, ensure_ascii=False)