feat: improve webhook filtering, worker status startup handling, and timestamp parsing

- Skip half_hour_report events from webhook posts in people_flow
- Handle pre-existing stale worker status files during startup gracefully
- Make store_dwell_alert timestamp parsing robust against invalid/empty values
- Update lessons learned and todo documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 17:05:31 +08:00
parent 9cde462cd1
commit 4e2ca3cff7
8 changed files with 148 additions and 63 deletions

View File

@@ -11,6 +11,10 @@ def _payload_for_webhook(payload: dict) -> dict:
return outbound
def _should_post_webhook(payload: dict) -> bool:
return payload.get("event") != "half_hour_report"
def dispatch_json_event(
path: str | Path,
payload: dict,
@@ -22,7 +26,7 @@ def dispatch_json_event(
with output_path.open("a", encoding="utf-8") as handle:
handle.write(json.dumps(payload, ensure_ascii=False) + "\n")
if not webhook_url.strip():
if not webhook_url.strip() or not _should_post_webhook(payload):
return
req = request.Request(

View File

@@ -66,12 +66,17 @@ def worker_status_stall_reason(
now: float | None = None,
) -> str | None:
current_time = datetime.now().timestamp() if now is None else now
age_seconds = worker_status_age_seconds(path, now=current_time)
if age_seconds is None:
try:
stat_result = path.stat()
except FileNotFoundError:
if current_time - started_at < max_age_seconds:
return None
return f"rtsp worker status missing path={path}"
if stat_result.st_mtime < started_at and current_time - started_at < max_age_seconds:
return None
age_seconds = max(0.0, current_time - stat_result.st_mtime)
if age_seconds <= max_age_seconds:
return None

View File

@@ -5,7 +5,7 @@ import json
from src.people_flow.webhook import dispatch_json_event
def test_dispatch_json_event_omits_tracks_from_webhook_but_keeps_local_log(
def test_dispatch_json_event_does_not_post_half_hour_report_but_keeps_local_log(
tmp_path, monkeypatch
):
sent: dict[str, object] = {}
@@ -44,9 +44,4 @@ def test_dispatch_json_event_omits_tracks_from_webhook_but_keeps_local_log(
lines = output.read_text(encoding="utf-8").splitlines()
assert json.loads(lines[0]) == payload
assert sent["url"] == "https://example.test/webhook"
assert sent["timeout"] == 7.5
assert sent["payload"] == {
"event": "half_hour_report",
"total_people": 3,
}
assert sent == {}

View File

@@ -99,3 +99,29 @@ def test_worker_status_stall_reason_reports_missing_and_stale_status(tmp_path: P
assert "status=missing" not in reason
assert "phase=tracking_frame" in reason
assert "age_seconds=200.0" in reason
def test_worker_status_stall_reason_ignores_preexisting_stale_file_during_startup(
    tmp_path: Path,
):
    """A status file older than the worker start is not reported as a stall early on."""
    status_file = tmp_path / "worker_status.json"
    write_worker_status(
        status_file,
        "waiting_to_reconnect",
        source="rtsp://camera/stream",
        window_index=0,
        frame_index=0,
        last_processed_at=None,
        note="open_failed",
    )
    # Backdate the file to t=100, i.e. before the worker start at t=250.
    stale_mtime = 100.0
    os.utime(status_file, (stale_mtime, stale_mtime))

    # Only 50 seconds have passed since start (now=300), below the 180-second limit.
    reason = worker_status_stall_reason(
        status_file,
        started_at=250.0,
        max_age_seconds=180.0,
        now=300.0,
    )

    assert reason is None

View File

@@ -212,23 +212,25 @@ def _build_summary(ctx: ManageContext) -> dict:
continue
if payload.get("event") == "half_hour_report":
last_report_time = _string_value(payload.get("window_end"))
active_count = _int_value(payload.get("active_customer_count"))
stat = _build_window_stat(payload)
window_stats.append(stat)
longest_dwell_seconds = max(
longest_dwell_seconds,
stat["max_wait_seconds"],
)
window_stats.sort(key=lambda item: _sort_timestamp(item["window_end"]), reverse=True)
for stat in window_stats:
if _parse_timestamp(stat["window_end"]) is None:
continue
last_report_time = stat["window_end"]
active_count = stat["active_customer_count"]
queue_level = stat["queue_level"]
over_threshold_count = stat["over_threshold_count"]
under_threshold_count = stat["under_threshold_count"]
status_change = stat["status_change"]
window_stats.sort(
key=lambda item: _parse_timestamp(item["window_end"]),
reverse=True,
)
break
headline = "No reports yet"
if last_report_time:
@@ -411,8 +413,20 @@ def _latest_timestamp(*values: str) -> str:
return latest_raw
def _parse_timestamp(value: str) -> datetime:
def _sort_timestamp(value: str) -> tuple[int, datetime]:
    """Build a sort key for a timestamp string.

    A parseable value yields ``(1, parsed_datetime)``. An unparseable value
    yields ``(0, <datetime.min in the local timezone>)``, so in ascending
    order every invalid timestamp ranks before every valid one.
    """
    moment = _parse_timestamp(value)
    if moment is not None:
        return (1, moment)
    # Placeholder stays timezone-aware so it remains comparable with parsed values.
    local_zone = datetime.now().astimezone().tzinfo
    return (0, datetime.min.replace(tzinfo=local_zone))
def _parse_timestamp(value: str) -> datetime | None:
if not value.strip():
return None
try:
parsed = datetime.fromisoformat(value)
except ValueError:
return None
if parsed.tzinfo is None:
return parsed.replace(tzinfo=datetime.now().astimezone().tzinfo)
return parsed

View File

@@ -52,7 +52,7 @@ def build_client(project_root: Path):
"over_threshold_count": 2,
"under_threshold_count": 1,
"queue_level": "normal",
"previous_queue_level": null,
"previous_queue_level": None,
"status_change": "initial",
},
}
@@ -149,6 +149,39 @@ def test_get_manage_summary(tmp_path: Path):
)
def test_get_manage_summary_ignores_invalid_report_timestamp(tmp_path: Path):
    """A half-hour report with an empty ``window_end`` must not become the latest result.

    Appends such a report to the events log and expects the summary to keep
    reporting ``2026-04-16T10:00:00+08:00`` (presumably the window end of the
    report seeded by ``build_client`` -- confirm against that fixture).
    """
    client, _ = build_client(tmp_path)
    invalid_report = {
        "event": "half_hour_report",
        "camera_id": "store_cam_01",
        "window_start": "2026-04-16T10:00:00+08:00",
        "window_end": "",
        "active_customer_count": 1,
        "queue_metrics": {
            "queue_level": "normal",
            "over_threshold_count": 1,
            "under_threshold_count": 0,
            "status_change": "unchanged",
        },
    }
    log_path = tmp_path / "logs" / "events.jsonl"
    with log_path.open("a", encoding="utf-8") as log_file:
        log_file.write(json.dumps(invalid_report) + "\n")

    response = client.get("/api/manage/summary")

    assert response.status_code == 200
    summary = response.json
    assert summary["last_result_time"] == "2026-04-16T10:00:00+08:00"
    newest_window = summary["metrics"]["recent_window_stats"][0]
    assert newest_window["window_end"] == "2026-04-16T10:00:00+08:00"
def test_get_manage_windows(tmp_path: Path):
client, _ = build_client(tmp_path)

View File

@@ -53,3 +53,35 @@
- Trigger: the user clarified that the managed-portal four-service rollout must follow the published installer on `root@10.8.0.1:/var/www/html/ai_deploy`.
- Rule: for managed-portal release updates, treat the published installer bundle and its embedded Compose/env files as the deployment source of truth instead of reverse-engineering the current host state.
- Preventive action: before updating the managed-portal stack on a target host, inspect `install-managed-portal-*.sh`, `release-manifest.env`, and the bundled `docker-compose.ota-release.yml` under `/var/www/html/ai_deploy`.
- Trigger: the user redirected a live service investigation from `10.8.0.14` to `10.8.0.15`.
- Rule: when continuing operational debugging across multiple hosts, do not assume the previously investigated host is still the active target after the user switches machines.
- Preventive action: restate the target host before diagnosis or remediation, and refresh runtime evidence from that exact machine instead of carrying over prior-host conclusions.
## 2026-06-09
- Trigger: the user corrected the intended people-flow RTSP source on `10.8.0.22`.
- Rule: when validating or repairing managed child-service deployments, treat the user-provided live RTSP URL as the source of truth and verify that the running container environment matches it exactly.
- Preventive action: after any host-specific stream correction, inspect both the release env file and the container's effective `RTSP_URL`; if they differ, recreate only the affected service with the repository Compose/env inputs and record the exact URL used.
- Trigger: the user corrected the intended `store_dwell_alert` RTSP source on `10.8.0.15`.
- Rule: for host-specific `store_dwell_alert` stream changes, verify both `RTSP_URL` and any derived identifiers such as `CAMERA_ID` in the deployed release env and the running container before concluding the rollout is correct.
- Preventive action: after changing a `store_dwell_alert` stream on a target host, inspect the release env, render `docker compose config`, and recreate only `store-dwell-alert` so the effective `RTSP_URL` and `CAMERA_ID` match the intended source.
- Trigger: the user corrected the intended `store_dwell_alert` RTSP source on `10.8.0.22`.
- Rule: even when the deployed release env on a host already has the intended `store_dwell_alert` stream, do not assume the running container picked it up; verify the live container environment separately.
- Preventive action: on host-specific `store_dwell_alert` changes, compare `deploy/managed-portal.release.env` with `docker inspect store-dwell-alert`; if the env is already correct but the container is stale, force-recreate only `store-dwell-alert`.
## 2026-06-10
- Trigger: the user clarified during the `.14` webhook repair that `video-recognition` `input_mode` is dedicated to the RTSP recognition path and must not be changed for webhook integration.
- Rule: when repairing `store-dwell-alert` to `video-recognition` webhook delivery on a host that already runs RTSP recognition, keep the main `video-recognition` `input_mode` unchanged unless the user explicitly requests a recognition-mode switch.
- Preventive action: before mirroring a reference host's webhook setup, check whether that host's `input_mode` differs from the target and, if it does, design the fix around a separate receiver path or image rather than changing the target's main recognition mode.
- Trigger: the user redirected the `.11` image reuse plan to go through the shared OTA registry tag instead of a host-local sidecar-only image.
- Rule: when a working image on one host needs to be reused by other machines, publish the exact validated image content to the user-specified OTA registry tag first, then update targets by pulling that registry tag rather than relying on host-local image transfer alone.
- Preventive action: before rolling a host-specific image fix to a single machine, check whether the user expects the image to become the shared registry baseline; if yes, validate the source image digest and publish it to the exact registry path before updating consumers.
- Trigger: the user clarified that the live `.14` deployment fix may use `sudo` on the target host.
- Rule: when host-owned deployment files block a required live fix and the user explicitly grants `sudo`, prefer the direct `sudo` path over indirect container-side file mutation.
- Preventive action: if a remote deployment edit fails on file ownership, check whether the user has authorized `sudo`; when authorized, switch to `sudo` for the host-side config edit and service recreation commands.

View File

@@ -2,54 +2,30 @@
## Checklist
- [x] Inspect the published managed-portal installer and release manifest under `root@10.8.0.1:/var/www/html/ai_deploy`.
- [x] Confirm the registry tags currently published for `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert`.
- [x] Prepare `10.8.0.14` for an installer-aligned rollout of the four-service managed-portal stack.
- [x] Recreate the four target containers on `10.8.0.14` using the published release version and corresponding Compose layout.
- [x] Verify the running stack on `10.8.0.14` uses the published registry images and the installer-managed Compose project.
- [x] Review repo guidance, recent lessons, and the validated `.14`/`.18` webhook-to-MQTT setup.
- [x] Inspect `xiaozheng@10.8.0.15` for `video-recognition` image digest/tag, `store-dwell-alert` webhook config, and current summary data.
- [x] Compare `.15` against the validated setup to determine whether the issue is missing webhook config, wrong image content, malformed summary data, or a combination.
- [x] Apply the minimum `.15` fix if the host is misconfigured.
- [x] Verify on `.15` that `store-dwell-alert` summary reads cleanly and the webhook receiving path is available.
## Scope And Risks
- Scope: use the published managed-portal release artifacts on `10.8.0.1` as the source of truth for image names, tags, and Compose topology.
- Scope: update the four-service managed-portal group on `10.8.0.14`: `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert`.
- Scope: keep unrelated stacks, especially the `iot-main` video-recognition project, untouched.
- Risk: the current four containers on `10.8.0.14` are not managed by one Compose project, so installer-based recreation will conflict on fixed container names unless the old containers are replaced cleanly.
- Risk: the published installer seeds config, data, outputs, and weights under `/opt/managed-portal-releases`; switching to it changes the runtime paths from the current ad hoc directories.
- Risk: service recreation causes a brief interruption for the portal and both child-service APIs.
- Scope: check whether `.15` has the same webhook/image/summary problem pattern already seen on `.14` and `.18`, and correct it if needed.
- Scope: keep changes limited to `.15` live deployment inputs and runtime state for `store-dwell-alert` and `video-recognition`.
- Risk: `.15` may have host-specific stream assignments and local `dev` containers, so fixes must preserve its existing RTSP settings.
- Risk: if `.15` still runs an older local `store-dwell-alert:dev`, a manual webhook test can create malformed summary data; any test record must be cleaned up afterward.
## Validation Intent
- Read the published installer and `release-manifest.env` to confirm the exact release version and image references.
- Verify the registry exposes the four target tags before rollout.
- Use the installer-aligned Compose files and environment from the published bundle, not a hand-built local variant.
- Confirm the final containers are recreated from the published registry images and are running under the installer-managed release directory.
- Capture `.15` live config and image state before making changes.
- If changes are needed, verify the final `video-recognition` digest, the `store-dwell-alert` webhook URL, the summary endpoint behavior, and the webhook receiving path on `.15`.
## Review
- Status: complete.
- Result:
- Confirmed the published managed-portal installer source of truth is `root@10.8.0.1:/var/www/html/ai_deploy/install-managed-portal-20260519-f3f40b5-11.sh`.
- Confirmed the published registry images in `release-manifest.env` are:
- `ota.zhengxinshipin.com:5443/managed-portal:20260519-f3f40b5-11`
- `ota.zhengxinshipin.com:5443/managed-portal-web:20260519-f3f40b5-11`
- `ota.zhengxinshipin.com:5443/people-flow-project:20260519-f3f40b5-11`
- `ota.zhengxinshipin.com:5443/store-dwell-alert:20260519-f3f40b5-11`
- Confirmed from `10.8.0.14` that all four image tags exist in the registry.
- Extracted the published release bundle to `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11`.
- Generated `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/managed-portal.runtime.env` from the published release, while keeping the host-specific data and output directories on `10.8.0.14`.
- Replaced the ad hoc `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert` containers with the installer-managed Compose project under `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/docker-compose.ota-release.yml`.
- Migrated `store-dwell-alert` to a schema-compatible config under `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/managed/store_dwell_alert/config/local.yaml` while preserving the current `192.168.0.5` RTSP source.
- Kept the current people-flow config/output/weights directories, but removed `gpus: all` from the release Compose file because the host currently fails `nvidia-container-cli` startup with an NVML driver/library mismatch. The new image falls back to CPU at runtime and still reports healthy.
- Result: `.15` `store-dwell-alert` was already correctly configured with `webhook.url: http://host.docker.internal:8080/api/webhook/managed-queue`, and its summary was healthy. The only misalignment was `video-recognition`, which was still on the old digest `sha256:4e098feb...`; it was updated by pulling the current `ota.zhengxinshipin.com:5443/ota/video-recognition:1.0.0` and recreating the service. At the time of the update, the registry served digest `sha256:0ebb05f2...` to `.15`.
- Verification:
- Published release manifest from `10.8.0.1` resolves to the four `20260519-f3f40b5-11` image tags above.
- Registry presence checks from `10.8.0.14` succeeded for all four image tags via `docker manifest inspect`.
- `sudo docker compose --env-file /opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/managed-portal.runtime.env -f /opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/docker-compose.ota-release.yml ps` showed:
- `managed-portal` -> `ota.zhengxinshipin.com:5443/managed-portal:20260519-f3f40b5-11`, `Up`
- `managed-portal-web` -> `ota.zhengxinshipin.com:5443/managed-portal-web:20260519-f3f40b5-11`, `Up`
- `people-flow-project` -> `ota.zhengxinshipin.com:5443/people-flow-project:20260519-f3f40b5-11`, `Up (healthy)`
- `store-dwell-alert` -> `ota.zhengxinshipin.com:5443/store-dwell-alert:20260519-f3f40b5-11`, `Up (healthy)`
- `sudo docker inspect` confirmed the four containers use the published registry image references; `people-flow-project` and `store-dwell-alert` report healthy.
- `curl -fsS http://127.0.0.1:13000` succeeded.
- `curl -fsS http://127.0.0.1:13000/api/managed-services` returned both managed services with `status: "running"`.
- `curl -fsS http://127.0.0.1:13000/api/managed-services/store_dwell_alert` returned the `192.168.0.5` RTSP source and `status: "running"`.
- `curl -fsS http://127.0.0.1:13000/api/managed-services/people_flow_project` returned the `192.168.0.4` RTSP source and `status: "running"`.
- Before update, `.15` `video-recognition` container/image digest was `sha256:4e098feb4505aeb2b5e718e7017d81ab08ea8d5a91467ec828d1acdb9de4752a`.
- `.15` `store-dwell-alert` runtime config already had `webhook.url: http://host.docker.internal:8080/api/webhook/managed-queue`, and `/api/manage/summary` returned `200` with `last_result_time=2026-06-10T11:08:58.026758+08:00`.
- After `docker pull` and recreate, `.15` `video-recognition` runs `sha256:0ebb05f2a1765b5322a8194014e914ab9dd2e10815da49bc0f5a42ab4df8a723`, matching the digest returned by the registry to `.15`.
- After the update, `POST http://127.0.0.1:8080/api/webhook/managed-queue` returned HTTP `200` with `forwarded_to_mqtt:true`, and `video-recognition` logged `POST "/api/webhook/managed-queue"` `200`.