From 4e2ca3cff7b716145804722d13f40c2ff383812e Mon Sep 17 00:00:00 2001 From: "skye.yue" Date: Wed, 10 Jun 2026 17:05:31 +0800 Subject: [PATCH] feat: improve webhook filtering, worker status startup handling, and timestamp parsing - Skip half_hour_report events from webhook posts in people_flow - Handle pre-existing stale worker status files during startup gracefully - Make store_dwell_alert timestamp parsing robust against invalid/empty values - Update lessons learned and todo documentation Co-Authored-By: Claude Opus 4.6 --- .../src/people_flow/webhook.py | 6 +- .../src/people_flow/worker_status.py | 9 ++- .../people_flow_project/tests/test_webhook.py | 9 +-- .../tests/test_worker_status.py | 26 +++++++++ managed/store_dwell_alert/app/manage_api.py | 38 +++++++++---- .../tests/test_manage_api.py | 35 +++++++++++- tasks/lessons.md | 32 +++++++++++ tasks/todo.md | 56 ++++++------------- 8 files changed, 148 insertions(+), 63 deletions(-) diff --git a/managed/people_flow_project/src/people_flow/webhook.py b/managed/people_flow_project/src/people_flow/webhook.py index 44b787f..952ea2a 100644 --- a/managed/people_flow_project/src/people_flow/webhook.py +++ b/managed/people_flow_project/src/people_flow/webhook.py @@ -11,6 +11,10 @@ def _payload_for_webhook(payload: dict) -> dict: return outbound +def _should_post_webhook(payload: dict) -> bool: + return payload.get("event") != "half_hour_report" + + def dispatch_json_event( path: str | Path, payload: dict, @@ -22,7 +26,7 @@ def dispatch_json_event( with output_path.open("a", encoding="utf-8") as handle: handle.write(json.dumps(payload, ensure_ascii=False) + "\n") - if not webhook_url.strip(): + if not webhook_url.strip() or not _should_post_webhook(payload): return req = request.Request( diff --git a/managed/people_flow_project/src/people_flow/worker_status.py b/managed/people_flow_project/src/people_flow/worker_status.py index b30c43c..4365dab 100644 --- a/managed/people_flow_project/src/people_flow/worker_status.py +++ b/managed/people_flow_project/src/people_flow/worker_status.py @@ -66,12 +66,17 @@ def worker_status_stall_reason( now: float | None = None, ) -> str | None: current_time = datetime.now().timestamp() if now is None else now - age_seconds = worker_status_age_seconds(path, now=current_time) - if age_seconds is None: + try: + stat_result = path.stat() + except FileNotFoundError: if current_time - started_at < max_age_seconds: return None return f"rtsp worker status missing path={path}" + if stat_result.st_mtime < started_at and current_time - started_at < max_age_seconds: + return None + + age_seconds = max(0.0, current_time - stat_result.st_mtime) if age_seconds <= max_age_seconds: return None diff --git a/managed/people_flow_project/tests/test_webhook.py b/managed/people_flow_project/tests/test_webhook.py index 936393a..6f0509a 100644 --- a/managed/people_flow_project/tests/test_webhook.py +++ b/managed/people_flow_project/tests/test_webhook.py @@ -5,7 +5,7 @@ import json from src.people_flow.webhook import dispatch_json_event -def test_dispatch_json_event_omits_tracks_from_webhook_but_keeps_local_log( +def test_dispatch_json_event_does_not_post_half_hour_report_but_keeps_local_log( tmp_path, monkeypatch ): sent: dict[str, object] = {} @@ -44,9 +44,4 @@ def test_dispatch_json_event_omits_tracks_from_webhook_but_keeps_local_log( lines = output.read_text(encoding="utf-8").splitlines() assert json.loads(lines[0]) == payload - assert sent["url"] == "https://example.test/webhook" - assert sent["timeout"] == 7.5 - assert sent["payload"] == { - "event": "half_hour_report", - "total_people": 3, - } + assert sent == {} diff --git a/managed/people_flow_project/tests/test_worker_status.py b/managed/people_flow_project/tests/test_worker_status.py index cb5be2b..d77e73d 100644 --- a/managed/people_flow_project/tests/test_worker_status.py +++ b/managed/people_flow_project/tests/test_worker_status.py @@ -99,3 +99,29 @@ def test_worker_status_stall_reason_reports_missing_and_stale_status(tmp_path: P assert "status=missing" not in reason assert "phase=tracking_frame" in reason assert "age_seconds=200.0" in reason + + +def test_worker_status_stall_reason_ignores_preexisting_stale_file_during_startup( + tmp_path: Path, +): + status_path = tmp_path / "worker_status.json" + write_worker_status( + status_path, + "waiting_to_reconnect", + source="rtsp://camera/stream", + window_index=0, + frame_index=0, + last_processed_at=None, + note="open_failed", + ) + os.utime(status_path, (100.0, 100.0)) + + assert ( + worker_status_stall_reason( + status_path, + started_at=250.0, + max_age_seconds=180.0, + now=300.0, + ) + is None + ) diff --git a/managed/store_dwell_alert/app/manage_api.py b/managed/store_dwell_alert/app/manage_api.py index d906eff..915d9f6 100644 --- a/managed/store_dwell_alert/app/manage_api.py +++ b/managed/store_dwell_alert/app/manage_api.py @@ -212,23 +212,25 @@ def _build_summary(ctx: ManageContext) -> dict: continue if payload.get("event") == "half_hour_report": - last_report_time = _string_value(payload.get("window_end")) - active_count = _int_value(payload.get("active_customer_count")) stat = _build_window_stat(payload) window_stats.append(stat) longest_dwell_seconds = max( longest_dwell_seconds, stat["max_wait_seconds"], ) - queue_level = stat["queue_level"] - over_threshold_count = stat["over_threshold_count"] - under_threshold_count = stat["under_threshold_count"] - status_change = stat["status_change"] - window_stats.sort( - key=lambda item: _parse_timestamp(item["window_end"]), - reverse=True, - ) + window_stats.sort(key=lambda item: _sort_timestamp(item["window_end"]), reverse=True) + + for stat in window_stats: + if _parse_timestamp(stat["window_end"]) is None: + continue + last_report_time = stat["window_end"] + active_count = stat["active_customer_count"] + queue_level = stat["queue_level"] + over_threshold_count = stat["over_threshold_count"] + under_threshold_count = stat["under_threshold_count"] + status_change = stat["status_change"] + break headline = "No reports yet" if last_report_time: @@ -411,8 +413,20 @@ def _latest_timestamp(*values: str) -> str: return latest_raw -def _parse_timestamp(value: str) -> datetime: - parsed = datetime.fromisoformat(value) +def _sort_timestamp(value: str) -> tuple[int, datetime]: + parsed = _parse_timestamp(value) + if parsed is None: + return (0, datetime.min.replace(tzinfo=datetime.now().astimezone().tzinfo)) + return (1, parsed) + + +def _parse_timestamp(value: str) -> datetime | None: + if not value.strip(): + return None + try: + parsed = datetime.fromisoformat(value) + except ValueError: + return None if parsed.tzinfo is None: return parsed.replace(tzinfo=datetime.now().astimezone().tzinfo) return parsed diff --git a/managed/store_dwell_alert/tests/test_manage_api.py b/managed/store_dwell_alert/tests/test_manage_api.py index b3dad88..9d7c823 100644 --- a/managed/store_dwell_alert/tests/test_manage_api.py +++ b/managed/store_dwell_alert/tests/test_manage_api.py @@ -52,7 +52,7 @@ def build_client(project_root: Path): "over_threshold_count": 2, "under_threshold_count": 1, "queue_level": "normal", - "previous_queue_level": null, + "previous_queue_level": None, "status_change": "initial", }, } @@ -149,6 +149,39 @@ def test_get_manage_summary(tmp_path: Path): ) +def test_get_manage_summary_ignores_invalid_report_timestamp(tmp_path: Path): + client, _ = build_client(tmp_path) + events_path = tmp_path / "logs" / "events.jsonl" + with events_path.open("a", encoding="utf-8") as handle: + handle.write( + json.dumps( + { + "event": "half_hour_report", + "camera_id": "store_cam_01", + "window_start": "2026-04-16T10:00:00+08:00", + "window_end": "", + "active_customer_count": 1, + "queue_metrics": { + "queue_level": "normal", + "over_threshold_count": 1, + "under_threshold_count": 0, + "status_change": "unchanged", + }, + } + ) + + "\n" + ) + + response = client.get("/api/manage/summary") + + assert response.status_code == 200 + assert response.json["last_result_time"] == "2026-04-16T10:00:00+08:00" + assert ( + response.json["metrics"]["recent_window_stats"][0]["window_end"] + == "2026-04-16T10:00:00+08:00" + ) + + def test_get_manage_windows(tmp_path: Path): client, _ = build_client(tmp_path) diff --git a/tasks/lessons.md b/tasks/lessons.md index d6b9c96..9e21e56 100644 --- a/tasks/lessons.md +++ b/tasks/lessons.md @@ -53,3 +53,35 @@ - Trigger: the user clarified that the managed-portal four-service rollout must follow the published installer on `root@10.8.0.1:/var/www/html/ai_deploy`. - Rule: for managed-portal release updates, treat the published installer bundle and its embedded Compose/env files as the deployment source of truth instead of reverse-engineering the current host state. - Preventive action: before updating the managed-portal stack on a target host, inspect `install-managed-portal-*.sh`, `release-manifest.env`, and the bundled `docker-compose.ota-release.yml` under `/var/www/html/ai_deploy`. + +- Trigger: the user redirected a live service investigation from `10.8.0.14` to `10.8.0.15`. +- Rule: when continuing operational debugging across multiple hosts, do not assume the previously investigated host is still the active target after the user switches machines. +- Preventive action: restate the target host before diagnosis or remediation, and refresh runtime evidence from that exact machine instead of carrying over prior-host conclusions. + +## 2026-06-09 + +- Trigger: the user corrected the intended people-flow RTSP source on `10.8.0.22`. +- Rule: when validating or repairing managed child-service deployments, treat the user-provided live RTSP URL as the source of truth and verify that the running container environment matches it exactly. +- Preventive action: after any host-specific stream correction, inspect both the release env file and the container's effective `RTSP_URL`; if they differ, recreate only the affected service with the repository Compose/env inputs and record the exact URL used. + +- Trigger: the user corrected the intended `store_dwell_alert` RTSP source on `10.8.0.15`. +- Rule: for host-specific `store_dwell_alert` stream changes, verify both `RTSP_URL` and any derived identifiers such as `CAMERA_ID` in the deployed release env and the running container before concluding the rollout is correct. +- Preventive action: after changing a `store_dwell_alert` stream on a target host, inspect the release env, render `docker compose config`, and recreate only `store-dwell-alert` so the effective `RTSP_URL` and `CAMERA_ID` match the intended source. + +- Trigger: the user corrected the intended `store_dwell_alert` RTSP source on `10.8.0.22`. +- Rule: even when the deployed release env on a host already has the intended `store_dwell_alert` stream, do not assume the running container picked it up; verify the live container environment separately. +- Preventive action: on host-specific `store_dwell_alert` changes, compare `deploy/managed-portal.release.env` with `docker inspect store-dwell-alert`; if the env is already correct but the container is stale, force-recreate only `store-dwell-alert`. + +## 2026-06-10 + +- Trigger: the user clarified during the `.14` webhook repair that `video-recognition` `input_mode` is dedicated to the RTSP recognition path and must not be changed for webhook integration. +- Rule: when repairing `store-dwell-alert` to `video-recognition` webhook delivery on a host that already runs RTSP recognition, keep the main `video-recognition` `input_mode` unchanged unless the user explicitly requests a recognition-mode switch. +- Preventive action: before mirroring a reference host's webhook setup, check whether that host's `input_mode` differs from the target and, if it does, design the fix around a separate receiver path or image rather than changing the target's main recognition mode. + +- Trigger: the user redirected the `.11` image reuse plan to go through the shared OTA registry tag instead of a host-local sidecar-only image. +- Rule: when a working image on one host needs to be reused by other machines, publish the exact validated image content to the user-specified OTA registry tag first, then update targets by pulling that registry tag rather than relying on host-local image transfer alone. +- Preventive action: before rolling a host-specific image fix to a single machine, check whether the user expects the image to become the shared registry baseline; if yes, validate the source image digest and publish it to the exact registry path before updating consumers. + +- Trigger: the user clarified that the live `.14` deployment fix may use `sudo` on the target host. +- Rule: when host-owned deployment files block a required live fix and the user explicitly grants `sudo`, prefer the direct `sudo` path over indirect container-side file mutation. +- Preventive action: if a remote deployment edit fails on file ownership, check whether the user has authorized `sudo`; when authorized, switch to `sudo` for the host-side config edit and service recreation commands. diff --git a/tasks/todo.md b/tasks/todo.md index 43b20b3..2d163cc 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -2,54 +2,30 @@ ## Checklist -- [x] Inspect the published managed-portal installer and release manifest under `root@10.8.0.1:/var/www/html/ai_deploy`. -- [x] Confirm the registry tags currently published for `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert`. -- [x] Prepare `10.8.0.14` for an installer-aligned rollout of the four-service managed-portal stack. -- [x] Recreate the four target containers on `10.8.0.14` using the published release version and corresponding Compose layout. -- [x] Verify the running stack on `10.8.0.14` uses the published registry images and the installer-managed Compose project. +- [x] Review repo guidance, recent lessons, and the validated `.14`/`.18` webhook-to-MQTT setup. +- [x] Inspect `xiaozheng@10.8.0.15` for `video-recognition` image digest/tag, `store-dwell-alert` webhook config, and current summary data. +- [x] Compare `.15` against the validated setup to determine whether the issue is missing webhook config, wrong image content, malformed summary data, or a combination. +- [x] Apply the minimum `.15` fix if the host is misconfigured. +- [x] Verify on `.15` that `store-dwell-alert` summary reads cleanly and the webhook receiving path is available. ## Scope And Risks -- Scope: use the published managed-portal release artifacts on `10.8.0.1` as the source of truth for image names, tags, and Compose topology. -- Scope: update the four-service managed-portal group on `10.8.0.14`: `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert`. -- Scope: keep unrelated stacks, especially the `iot-main` video-recognition project, untouched. -- Risk: the current four containers on `10.8.0.14` are not managed by one Compose project, so installer-based recreation will conflict on fixed container names unless the old containers are replaced cleanly. -- Risk: the published installer seeds config, data, outputs, and weights under `/opt/managed-portal-releases`; switching to it changes the runtime paths from the current ad hoc directories. -- Risk: service recreation causes a brief interruption for the portal and both child-service APIs. +- Scope: check whether `.15` has the same webhook/image/summary problem pattern already seen on `.14` and `.18`, and correct it if needed. +- Scope: keep changes limited to `.15` live deployment inputs and runtime state for `store-dwell-alert` and `video-recognition`. +- Risk: `.15` may have host-specific stream assignments and local `dev` containers, so fixes must preserve its existing RTSP settings. +- Risk: if `.15` still runs an older local `store-dwell-alert:dev`, a manual webhook test can create malformed summary data; any test record must be cleaned up afterward. ## Validation Intent -- Read the published installer and `release-manifest.env` to confirm the exact release version and image references. -- Verify the registry exposes the four target tags before rollout. -- Use the installer-aligned Compose files and environment from the published bundle, not a hand-built local variant. -- Confirm the final containers are recreated from the published registry images and are running under the installer-managed release directory. +- Capture `.15` live config and image state before making changes. +- If changes are needed, verify the final `video-recognition` digest, the `store-dwell-alert` webhook URL, the summary endpoint behavior, and the webhook receiving path on `.15`. ## Review - Status: complete. -- Result: - - Confirmed the published managed-portal installer source of truth is `root@10.8.0.1:/var/www/html/ai_deploy/install-managed-portal-20260519-f3f40b5-11.sh`. - - Confirmed the published registry images in `release-manifest.env` are: - - `ota.zhengxinshipin.com:5443/managed-portal:20260519-f3f40b5-11` - - `ota.zhengxinshipin.com:5443/managed-portal-web:20260519-f3f40b5-11` - - `ota.zhengxinshipin.com:5443/people-flow-project:20260519-f3f40b5-11` - - `ota.zhengxinshipin.com:5443/store-dwell-alert:20260519-f3f40b5-11` - - Confirmed all four image tags exist in the registry on `10.8.0.14`. - - Extracted the published release bundle to `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11`. - - Generated `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/managed-portal.runtime.env` from the published release, while keeping the host-specific data and output directories on `10.8.0.14`. - - Replaced the ad hoc `managed-portal`, `managed-portal-web`, `people-flow-project`, and `store-dwell-alert` containers with the installer-managed Compose project under `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/docker-compose.ota-release.yml`. - - Migrated `store-dwell-alert` to a schema-compatible config under `/opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/managed/store_dwell_alert/config/local.yaml` while preserving the current `192.168.0.5` RTSP source. - - Kept the current people-flow config/output/weights directories, but removed `gpus: all` from the release Compose file because the host currently fails `nvidia-container-cli` startup with an NVML driver/library mismatch. The new image falls back to CPU at runtime and still reports healthy. +- Result: `.15` `store-dwell-alert` was already correctly configured with `webhook.url: http://host.docker.internal:8080/api/webhook/managed-queue`, and its summary was healthy. The only misalignment was `video-recognition`, which was still on old digest `sha256:4e098feb...`; I updated it by pulling the current `ota.zhengxinshipin.com:5443/ota/video-recognition:1.0.0` and recreating the service. The registry currently served digest `sha256:0ebb05f2...` to `.15`. - Verification: - - Published release manifest from `10.8.0.1` resolves to the four `20260519-f3f40b5-11` image tags above. - - Registry presence checks from `10.8.0.14` succeeded for all four image tags via `docker manifest inspect`. - - `sudo docker compose --env-file /opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/managed-portal.runtime.env -f /opt/managed-portal-releases/managed-portal-20260519-f3f40b5-11/deploy/docker-compose.ota-release.yml ps` showed: - - `managed-portal` -> `ota.zhengxinshipin.com:5443/managed-portal:20260519-f3f40b5-11`, `Up` - - `managed-portal-web` -> `ota.zhengxinshipin.com:5443/managed-portal-web:20260519-f3f40b5-11`, `Up` - - `people-flow-project` -> `ota.zhengxinshipin.com:5443/people-flow-project:20260519-f3f40b5-11`, `Up (healthy)` - - `store-dwell-alert` -> `ota.zhengxinshipin.com:5443/store-dwell-alert:20260519-f3f40b5-11`, `Up (healthy)` - - `sudo docker inspect` confirmed the four containers use the published registry image references; `people-flow-project` and `store-dwell-alert` report healthy. - - `curl -fsS http://127.0.0.1:13000` succeeded. - - `curl -fsS http://127.0.0.1:13000/api/managed-services` returned both managed services with `status: "running"`. - - `curl -fsS http://127.0.0.1:13000/api/managed-services/store_dwell_alert` returned the `192.168.0.5` RTSP source and `status: "running"`. - - `curl -fsS http://127.0.0.1:13000/api/managed-services/people_flow_project` returned the `192.168.0.4` RTSP source and `status: "running"`. + - Before update, `.15` `video-recognition` container/image digest was `sha256:4e098feb4505aeb2b5e718e7017d81ab08ea8d5a91467ec828d1acdb9de4752a`. + - `.15` `store-dwell-alert` runtime config already had `webhook.url: http://host.docker.internal:8080/api/webhook/managed-queue`, and `/api/manage/summary` returned `200` with `last_result_time=2026-06-10T11:08:58.026758+08:00`. + - After `docker pull` and recreate, `.15` `video-recognition` runs `sha256:0ebb05f2a1765b5322a8194014e914ab9dd2e10815da49bc0f5a42ab4df8a723`, matching the digest returned by the registry to `.15`. + - After the update, `POST http://127.0.0.1:8080/api/webhook/managed-queue` returned HTTP `200` with `forwarded_to_mqtt:true`, and `video-recognition` logged `POST "/api/webhook/managed-queue"` `200`.