chahinebrini db7875fb34 feat(ops/mdm): AdGuard ClientID handshake — nginx + watcher
End-to-end DoH-to-backend wiring for Mac auto-activation:

  Mac → dns.rebreak.org/dns-query/<token> → nginx → AdGuard
  → querylog.json (CP field) → watcher.py → POST /handshake → backend

- ops/nginx/dns.rebreak.org.conf: vhost with `location ^~ /dns-query`
  prefix-match (not exact). proxy_pass without trailing slash preserves
  the full path so AdGuard parses the ClientID natively.
- watcher.py: NDJSON tail with inode-based rotation safety, per-token
  60s in-memory cooldown, urllib (no external deps), graceful 401/404/5xx
- rebreak-handshake-watcher.service: systemd unit, EnvironmentFile with
  chmod 600 (HANDSHAKE_SECRET never in git), NoNewPrivileges + PrivateTmp
- DOH_CLIENTID_HANDSHAKE.md: architecture + flow diagram + risk table
- RUNBOOK.md: status/logs/restart commands + deploy ordering

Not yet deployed. Verify-checklist before `nginx -s reload`:
  1. confirm AdGuard DoH port (config assumes 127.0.0.1:3000)
  2. confirm TLS cert exists for dns.rebreak.org
  3. snapshot current nginx config
  4. `nginx -t` dry-run
  5. functional curl + grep CP in querylog before starting watcher
2026-05-15 22:41:38 +02:00

297 lines
11 KiB
Python

#!/usr/bin/env python3
"""
adguard-handshake-watcher
=========================
Tails AdGuard Home's querylog.json (rotated) and fires a POST to the
rebreak backend's /api/devices/protected/handshake endpoint whenever
a DNS query contains a non-empty ClientID (= dnsToken of a protected device).
Environment variables (required / optional):
HANDSHAKE_SECRET — shared secret, sent as x-handshake-secret header
(required; provisioned via Infisical, see RUNBOOK)
BACKEND_URL — base URL of the backend, no trailing slash
default: https://staging.rebreak.org
QUERYLOG_PATH — path to AdGuard's querylog.json
default: /opt/adguardhome/data/querylog.json
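Per-token in-memory cooldown: 60 seconds.
After a POST that the backend answered with 2xx, 401 or 404, no further POST
is fired for that token for one minute, even if the browser hammers DoH at
10+ req/s. This keeps backend write-pressure negligible. Other HTTP errors
and network failures do not start the cooldown; they are retried on the next
log line that carries the token.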
Log-rotation safety:
AdGuard rotates querylog.json by renaming and creating a new file.
The watcher detects EOF + inode change and re-opens the new file.
Polling interval: 1 second.
"""
import json
import logging
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import urllib.request
import urllib.error
# ── Logging ──────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
# datefmt ends in a literal "Z" (the UTC designator), but logging renders
# %(asctime)s with time.localtime by default. Switch the converter to
# time.gmtime so the printed timestamps really are UTC.
logging.Formatter.converter = time.gmtime
log = logging.getLogger("handshake-watcher")
# ── Config ───────────────────────────────────────────────────────────────────
# Shared secret for the x-handshake-secret header. Checked first: without it
# every request would be rejected, so the process refuses to start.
HANDSHAKE_SECRET: str = os.getenv("HANDSHAKE_SECRET", "")
if HANDSHAKE_SECRET == "":
    log.error("HANDSHAKE_SECRET env var is not set — cannot authenticate to backend. Exiting.")
    sys.exit(1)

# Backend base URL; any trailing slash is removed so paths can be appended.
BACKEND_URL: str = os.getenv("BACKEND_URL", "https://staging.rebreak.org").rstrip("/")
# Location of the AdGuard Home query log that is tailed.
QUERYLOG_PATH: str = os.getenv("QUERYLOG_PATH", "/opt/adguardhome/data/querylog.json")

# Minimum gap, in seconds, between two POSTs for the same token.
COOLDOWN_SECONDS: int = 60
# Seconds to sleep between polls when the query log has no new data.
POLL_INTERVAL: float = 1.0
# ── State ────────────────────────────────────────────────────────────────────
# token -> time.monotonic() reading stored by _mark_fired() the last time it
# was called for that token. This is a monotonic clock value, not a unix
# timestamp, so it is only meaningful as a difference within this process.
_last_fired: dict[str, float] = {}
def _cooldown_ok(token: str) -> bool:
    """Tell whether a POST for ``token`` is currently allowed.

    Returns False only while an entry for the token exists and is younger
    than COOLDOWN_SECONDS; unknown tokens are always allowed.
    """
    fired_at = _last_fired.get(token)
    if fired_at is not None and time.monotonic() - fired_at < COOLDOWN_SECONDS:
        return False
    return True
def _mark_fired(token: str) -> None:
    """Start (or restart) the cooldown window for ``token``.

    Once the table has grown past a fixed size, entries whose cooldown has
    already elapsed are evicted first. Tokens are ClientIDs taken from the
    query log, so the set of keys is not under this process's control and
    would otherwise grow for the whole lifetime of the daemon. Evicting an
    expired entry does not change what ``_cooldown_ok`` returns: a missing
    entry and an expired entry both allow the next POST.
    """
    prune_threshold = 4096  # table size at which expired entries get evicted
    now = time.monotonic()
    if len(_last_fired) >= prune_threshold:
        # Collect first, delete afterwards: a dict must not change size
        # while it is being iterated.
        expired = [
            seen
            for seen, fired_at in _last_fired.items()
            if now - fired_at >= COOLDOWN_SECONDS
        ]
        for seen in expired:
            del _last_fired[seen]
    _last_fired[token] = now
# ── HTTP ─────────────────────────────────────────────────────────────────────
def post_handshake(token: str) -> None:
    """
    Report one observed ClientID (dnsToken) to the backend.

    Request:
        POST {BACKEND_URL}/api/devices/protected/handshake
        Body:   { "token": "<token>" }
        Header: x-handshake-secret: <HANDSHAKE_SECRET>

    HTTP, network and response-format problems are logged, never raised.

    Cooldown bookkeeping (via ``_mark_fired``):
        2xx   — marked fired. An empty body is accepted; a body that is not
                a JSON object is logged as a warning but still counts as
                delivered, because the backend did accept the request.
        401   — secret wrong: log error, mark fired (retrying at log-line
                rate cannot fix a bad secret).
        404   — token unknown: log warning, mark fired so an unknown token
                is retried once per cooldown window instead of per query.
        other HTTP status / network error — log error, do NOT mark fired,
                so the next log line carrying this token triggers another
                attempt.
    """
    # Local import: only needed for the HTTPException catch below.
    import http.client

    url = f"{BACKEND_URL}/api/devices/protected/handshake"
    payload = json.dumps({"token": token}).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=payload,
        method="POST",
        headers={
            "Content-Type": "application/json",
            "x-handshake-secret": HANDSHAKE_SECRET,
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            body = resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # HTTPError must be handled before URLError/OSError: it is a
        # subclass of both.
        body_bytes = exc.read() if exc.fp else b""
        body_str = body_bytes.decode("utf-8", errors="replace")
        if exc.code == 401:
            log.error(
                "token=%s: 401 UNAUTHORIZED — HANDSHAKE_SECRET mismatch. "
                "Check Infisical secret. body=%s",
                token, body_str,
            )
            _mark_fired(token)
        elif exc.code == 404:
            log.warning(
                "token=%s: 404 TOKEN_NOT_FOUND — token not in DB yet (pending provisioning?). "
                "Will retry after cooldown. body=%s",
                token, body_str,
            )
            _mark_fired(token)
        else:
            log.error(
                "token=%s: HTTP %d from backend. Will retry. body=%s",
                token, exc.code, body_str,
            )
        return
    except (urllib.error.URLError, OSError, http.client.HTTPException) as exc:
        # HTTPException covers failures while reading the response body
        # (e.g. IncompleteRead), which are not OSError subclasses.
        log.error("token=%s: request failed (%s). Will retry.", token, exc)
        return

    # From here on the backend has answered 2xx: the handshake was delivered,
    # whatever the body looks like.
    _mark_fired(token)

    try:
        data = json.loads(body) if body else {}
    except json.JSONDecodeError:
        data = None
    if not isinstance(data, dict):
        log.warning(
            "token=%s: backend answered 2xx with a body that is not a JSON object; "
            "treating handshake as delivered. body=%s",
            token, body,
        )
        return

    if data.get("ignored"):
        log.debug("token=%s: backend ignored (revoked/inactive)", token)
        return

    status = data.get("status", "?")
    if data.get("statusChanged", False):
        log.info("token=%s: status → %s (changed)", token, status)
    else:
        log.debug("token=%s: lastDnsQueryAt updated, status=%s", token, status)
# ── QueryLog parsing ─────────────────────────────────────────────────────────
#
# AdGuard Home writes querylog.json as a sequence of newline-delimited JSON
# objects (NDJSON), one per line. Each object looks like:
#
#   {
#     "T": "2026-05-15T12:34:56.789Z",   // timestamp (ISO8601)
#     "QH": "example.com",               // queried hostname
#     "QT": "A",                         // query type
#     "QC": "IN",                        // query class
#     "CP": "doh",                       // client protocol (transport), NOT the ClientID
#     "CID": "abc123def456",             // ClientID (the dnsToken); absent when empty
#     "Result": { ... },
#     "Elapsed": 123456,
#     "IP": "127.0.0.1",
#     ...
#   }
#
# The "CID" field is set by AdGuard when a ClientID is embedded in the
# DNS-over-HTTPS URL path:
#   /dns-query/<clientid>
# "CP" only names the transport the query arrived over and has the same value
# for every DoH client, so it must never be used as a token.
#
# References:
#   https://github.com/AdguardTeam/AdGuardHome/wiki/Clients#clientid
#   AdGuard Home source: querylog/querylog_file.go, dnsforward/client_id.go
#
# NOTE: field names as of AdGuard Home v0.107.x. If "CID" appears empty or
# absent, double-check by tailing the actual querylog after a test query:
#   docker exec adguardhome tail -f /opt/adguardhome/data/querylog.json
# and doing:
#   curl -s -H "accept: application/dns-json" \
#     "https://dns.rebreak.org/dns-query/TESTTOKEN?name=example.com&type=A"
# NOTE(review): the curl line assumes the endpoint answers JSON-style DoH
# requests — confirm; otherwise send a wire-format DoH query instead.
def extract_client_id(line: str) -> Optional[str]:
    """
    Parse one NDJSON line from querylog.json.

    Returns the whitespace-stripped ClientID (the "CID" field) when the line
    is a JSON object carrying a non-empty string there. Returns None for
    blank lines, lines that are not valid JSON, JSON values that are not
    objects, and entries without a usable ClientID.
    """
    line = line.strip()
    if not line:
        return None
    try:
        entry = json.loads(line)
    except json.JSONDecodeError:
        return None
    # A truncated or foreign line can still be valid JSON of another shape
    # (number, string, list); only objects have fields to look at.
    if not isinstance(entry, dict):
        return None
    cid = entry.get("CID", "")
    if isinstance(cid, str) and cid.strip():
        return cid.strip()
    return None
# ── File tailing with rotation detection ─────────────────────────────────────
class RotationSafeTailer:
    """
    Tails a text file line by line and follows it across log rotation.

    Rotation is detected when ``readline`` finds no new data and the inode
    that ``path`` currently resolves to differs from the inode of the open
    handle (or the path is gone). Before switching, the old handle is read
    until it is exhausted so entries appended just before the rename are not
    lost; the replacement file is then read from offset 0.

    Only complete lines (terminated by a newline) are returned. A fragment
    whose newline has not been written yet is buffered and returned once the
    rest arrives, or flushed as-is when the file it belongs to is rotated
    away.
    """

    def __init__(self, path: str) -> None:
        self.path = Path(path)
        self._file = None
        self._inode: Optional[int] = None
        # Text of a line that has been read without its terminating newline.
        self._partial: str = ""
        self._open()

    def _open(self) -> None:
        """(Re)open ``path`` from offset 0, closing any previous handle.

        A missing file is not an error: the tailer is left without a handle
        and ``readline`` tries again on its next call.
        """
        if self._file:
            try:
                self._file.close()
            except OSError:
                pass
            self._file = None
        self._inode = None
        self._partial = ""
        try:
            handle = open(self.path, "r", encoding="utf-8", errors="replace")
        except FileNotFoundError:
            log.warning("querylog not found: %s — will retry", self.path)
            return
        # Take the inode from the open handle, not from the path: the path
        # may already point at a different file if a rotation happened
        # between open() and a second path lookup.
        self._inode = os.fstat(handle.fileno()).st_ino
        self._file = handle
        log.info("Opened querylog: %s (inode=%d)", self.path, self._inode)

    def _seek_to_end_on_first_open(self) -> None:
        """Call once after construction to skip entries already in the file.

        Not used after a rotation: a freshly rotated file is read from the
        beginning so nothing written to it is skipped.
        """
        if self._file:
            self._file.seek(0, os.SEEK_END)

    def _read_complete_line(self) -> Optional[str]:
        """Return the next newline-terminated line from the open handle.

        Returns None when no data is available or when only a fragment
        without a newline could be read; the fragment is kept in
        ``_partial`` and prepended to the data read next.
        """
        chunk = self._file.readline()
        if not chunk:
            return None
        if not chunk.endswith("\n"):
            self._partial += chunk
            return None
        line = self._partial + chunk
        self._partial = ""
        return line

    def readline(self) -> Optional[str]:
        """
        Returns the next line or None if no new data.
        Handles rotation transparently.
        """
        if self._file is None:
            self._open()
            return None

        line = self._read_complete_line()
        if line is not None:
            return line

        # Nothing complete to return — find out whether the file was rotated.
        try:
            current_inode = self.path.stat().st_ino
        except FileNotFoundError:
            current_inode = None
        if current_inode is not None and current_inode == self._inode:
            return None

        # The path no longer names the file we hold. Data may have been
        # appended to the old file between our last read and the rename, so
        # drain it before letting go of the handle.
        line = self._read_complete_line()
        if line is not None:
            return line

        if current_inode is None:
            log.warning("querylog disappeared (rotation in progress?) — will re-open")
        else:
            log.info(
                "querylog rotation detected (inode %d -> %d) — re-opening",
                self._inode, current_inode,
            )
        # The old file will not grow any more: hand out whatever is buffered
        # instead of discarding it (_open() resets the buffer).
        leftover = self._partial
        self._open()
        return leftover or None
# ── Main loop ─────────────────────────────────────────────────────────────────
def main() -> None:
    """Run the watcher loop forever.

    Skips whatever is already in the query log, then processes entries as
    they appear: every line with a ClientID whose cooldown has elapsed leads
    to one handshake POST. Sleeps POLL_INTERVAL seconds whenever the tailer
    has nothing to hand out.
    """
    log.info(
        "Starting handshake-watcher | backend=%s | querylog=%s | cooldown=%ds",
        BACKEND_URL, QUERYLOG_PATH, COOLDOWN_SECONDS,
    )

    tailer = RotationSafeTailer(QUERYLOG_PATH)
    tailer._seek_to_end_on_first_open()

    while True:
        raw = tailer.readline()
        if not raw:
            # Nothing new in the log: back off before polling again.
            time.sleep(POLL_INTERVAL)
            continue

        token = extract_client_id(raw)
        if not token:
            continue
        if not _cooldown_ok(token):
            continue

        log.info("Firing handshake for token=%s", token)
        post_handshake(token)


if __name__ == "__main__":
    main()