Add QSC prompt and phase timings

This commit is contained in:
yangyl
2026-06-17 22:52:54 +08:00
parent ef0047af6d
commit 0150c1ab5c
6 changed files with 304 additions and 118 deletions

View File

@@ -1,9 +1,12 @@
from __future__ import annotations
import argparse
from contextlib import contextmanager
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Sequence
from typing import Callable, Iterator, Sequence, TypeVar
from .aggregator import aggregate_outputs
from .clips import build_clip_records
@@ -18,6 +21,64 @@ from .result_parser import build_clip_result
from .timeline import DEFAULT_TIMEZONE, format_beijing_time, timeline_start_epoch
from .vlm_client import infer_clip
# Type variable for the return type of the callable handed to _measure_phase.
T = TypeVar("T")
def _new_phase_timings() -> dict[str, object]:
    """Create an empty phase-timings record for a new run.

    The record carries a schema version tag, ``started_at`` and
    ``updated_at`` timestamps obtained from ``_utc_now_iso()``, and an empty
    ``phases`` mapping.

    Returns:
        A freshly built dict; nothing is shared between calls.
    """
    # Read the clock once: a brand-new record should report the same instant
    # for both timestamps rather than two values microseconds apart.
    created_at = _utc_now_iso()
    return {
        "schema_version": "phase-timings-v1",
        "started_at": created_at,
        "updated_at": created_at,
        "phases": {},
    }
def _write_phase_timings(
    output_dir: Path,
    phase_timings: dict[str, object],
) -> None:
    """Refresh the record's ``updated_at`` stamp and save it as JSON.

    Args:
        output_dir: Directory that receives ``phase_timings.json``.
        phase_timings: Record to persist. Its ``updated_at`` entry is
            overwritten in place before serialisation.
    """
    phase_timings["updated_at"] = _utc_now_iso()
    target = output_dir / "phase_timings.json"
    serialized = json.dumps(
        phase_timings,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
    )
    # The JSON document is followed by exactly one newline.
    target.write_text(f"{serialized}\n", encoding="utf-8")
def _measure_phase(
    phase_timings: dict[str, object] | None,
    phase_name: str,
    func: Callable[[], T],
) -> T:
    """Call ``func`` inside ``_timed_phase`` and hand back its result.

    Args:
        phase_timings: Record forwarded to ``_timed_phase``; may be ``None``.
        phase_name: Key forwarded to ``_timed_phase``.
        func: Zero-argument callable to run.

    Returns:
        Whatever ``func`` returns. Exceptions raised by ``func`` propagate.
    """
    with _timed_phase(phase_timings, phase_name):
        outcome = func()
    return outcome
@contextmanager
def _timed_phase(
phase_timings: dict[str, object] | None,
phase_name: str,
) -> Iterator[None]:
started = time.perf_counter()
try:
yield
finally:
if phase_timings is not None:
phases = phase_timings.get("phases")
if not isinstance(phases, dict):
phases = {}
phase_timings["phases"] = phases
previous = phases.get(phase_name, 0)
if not isinstance(previous, (int, float)):
previous = 0
phases[phase_name] = round(
float(previous) + time.perf_counter() - started,
6,
)
def _utc_now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def main(argv: Sequence[str] | None = None) -> int:
parser = argparse.ArgumentParser(
@@ -43,6 +104,7 @@ def main(argv: Sequence[str] | None = None) -> int:
output_dir = Path(config["output"]["dir"])
output_dir.mkdir(parents=True, exist_ok=True)
phase_timings = _new_phase_timings()
video_manifest_path = output_dir / "video_manifest.jsonl"
resume_enabled = bool(config.get("output", {}).get("resume", False))
@@ -63,11 +125,13 @@ def main(argv: Sequence[str] | None = None) -> int:
records,
record_indexes,
download_source=not args.dry_run,
phase_timings=phase_timings,
)
except ValueError as exc:
parser.error(str(exc))
write_manifest(video_manifest_path, records)
_write_phase_timings(output_dir, phase_timings)
if args.dry_run:
return 0
@@ -93,27 +157,29 @@ def main(argv: Sequence[str] | None = None) -> int:
if record.get("status") == "sampled" and record.get("video_id")
}
changed_frame_video_ids: set[str] = set(backfilled_frame_video_ids)
for record in records:
if record.get("status") != "probed":
continue
video_id = str(record.get("video_id"))
if args.until == "inference" and video_id in existing_clip_video_ids:
continue
if video_id in existing_sampled_video_ids:
continue
frame_records = _without_video_records(frame_records, video_id)
ffmpeg_config = dict(config["ffmpeg"])
ffmpeg_config["timezone"] = timezone_name
frame_records.extend(
sample_video_frames(
record,
output_dir,
ffmpeg_config,
manifest_path=None,
with _timed_phase(phase_timings, "frame_sampling_seconds"):
for record in records:
if record.get("status") != "probed":
continue
video_id = str(record.get("video_id"))
if args.until == "inference" and video_id in existing_clip_video_ids:
continue
if video_id in existing_sampled_video_ids:
continue
frame_records = _without_video_records(frame_records, video_id)
ffmpeg_config = dict(config["ffmpeg"])
ffmpeg_config["timezone"] = timezone_name
frame_records.extend(
sample_video_frames(
record,
output_dir,
ffmpeg_config,
manifest_path=None,
)
)
)
changed_frame_video_ids.add(video_id)
changed_frame_video_ids.add(video_id)
write_manifest(frame_manifest_path, frame_records)
_write_phase_timings(output_dir, phase_timings)
sampled_video_ids = {
str(record.get("video_id"))
@@ -133,22 +199,28 @@ def main(argv: Sequence[str] | None = None) -> int:
for record in frame_records
if str(record.get("video_id")) in clip_rebuild_video_ids
]
clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
with _timed_phase(phase_timings, "clip_generation_seconds"):
clip_records.extend(build_clip_records(frames_to_build, config["clip"]))
write_manifest(output_dir / "clip_manifest.jsonl", clip_records)
_write_phase_timings(output_dir, phase_timings)
if args.until == "clips":
return 0
_run_inference(
clip_records,
records,
output_dir,
config,
limit_clips=args.limit_clips,
resume=resume_enabled,
)
with _timed_phase(phase_timings, "inference_seconds"):
_run_inference(
clip_records,
records,
output_dir,
config,
limit_clips=args.limit_clips,
resume=resume_enabled,
)
_write_phase_timings(output_dir, phase_timings)
if args.until == "inference":
return 0
aggregate_outputs(output_dir, config)
with _timed_phase(phase_timings, "aggregation_seconds"):
aggregate_outputs(output_dir, config)
_write_phase_timings(output_dir, phase_timings)
return 0
@@ -175,33 +247,40 @@ def _acquire_source_records(
record_indexes: dict[str, int],
*,
download_source: bool = True,
phase_timings: dict[str, object] | None = None,
) -> None:
for source_record in _source_video_records(
config,
output_dir,
download_source=download_source,
):
path = source_record.get("path")
if not path:
continue
video_id = stable_video_id(str(path))
existing_index = record_indexes.get(video_id)
if (
existing_index is not None
and records[existing_index].get("status") == "probed"
):
continue
probe_record = probe_video(
str(path),
timeout_seconds=config["ffprobe"]["timeout_seconds"],
source_records = _measure_phase(
phase_timings,
"source_acquisition_seconds",
lambda: _source_video_records(
config,
output_dir,
download_source=download_source,
)
record = {**source_record, **probe_record, "video_id": video_id}
if existing_index is None:
record_indexes[video_id] = len(records)
records.append(record)
else:
records[existing_index] = record
)
with _timed_phase(phase_timings, "video_probe_seconds"):
for source_record in source_records:
path = source_record.get("path")
if not path:
continue
video_id = stable_video_id(str(path))
existing_index = record_indexes.get(video_id)
if (
existing_index is not None
and records[existing_index].get("status") == "probed"
):
continue
probe_record = probe_video(
str(path),
timeout_seconds=config["ffprobe"]["timeout_seconds"],
)
record = {**source_record, **probe_record, "video_id": video_id}
if existing_index is None:
record_indexes[video_id] = len(records)
records.append(record)
else:
records[existing_index] = record
def _source_video_records(