From 372aaa43dd584b8801eea3efacc95f443f940f18 Mon Sep 17 00:00:00 2001 From: chahinebrini Date: Mon, 1 Jun 2026 09:32:25 +0200 Subject: [PATCH] fix(ci): pipeline race-condition + health-check retry + maestro secrets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hauptproblem: Webhook-Deploy (deploy.sh) und GH-Actions-Deploy (deploy-from-artifact.sh) liefen gleichzeitig → Race auf .output-staging und doppelter pm2-restart. Fixes: - deploy-from-artifact.sh: setzt .deploy-ga.lock (noclobber, mit PID) während Deploy läuft; stale locks werden erkannt und überschrieben - deploy.sh: prüft .deploy-ga.lock bei Start — wenn GH-Actions aktiv, sauberes exit 0 statt Kollision - Health-Check: Retry-Loop (12× × 5s = max 60s) statt einmaligem sleep 5; Infisical-Login + Nitro-Start braucht auf gestresstem Server bis 30s - maestro-cloud.yml: ungültiges `if: secrets.X != ''` entfernt (secrets in if-conditions sind in GH-Actions immer leer); stattdessen expliziter secrets-check als erster Step mit klarer Fehlermeldung - pnpm --prefer-offline in deploy-from-artifact.sh: nutzt Store-Cache - .gitignore: .deploy-ga.lock ergänzt Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/deploy-admin-staging.yml | 20 +++++---- .github/workflows/deploy-staging.yml | 22 ++++++---- .github/workflows/maestro-cloud.yml | 50 ++++++++++++++-------- .gitignore | 1 + scripts/deploy-from-artifact.sh | 28 ++++++++++-- scripts/deploy.sh | 17 ++++++++ 6 files changed, 100 insertions(+), 38 deletions(-) diff --git a/.github/workflows/deploy-admin-staging.yml b/.github/workflows/deploy-admin-staging.yml index b2fa91f..7c00161 100644 --- a/.github/workflows/deploy-admin-staging.yml +++ b/.github/workflows/deploy-admin-staging.yml @@ -110,11 +110,15 @@ jobs: - name: Health-Check (HTTP 3xx/200 = Server erreichbar) run: | - sleep 5 - STATUS=$(curl -sS -o /dev/null -w '%{http_code}' \ - https://admin.staging.rebreak.org/ || echo "000") - echo "admin.staging.rebreak.org/ -> HTTP $STATUS" - if [ "$STATUS" = "000" ] || [ "$STATUS" = "502" ] || [ "$STATUS" = "503" ]; then - echo "FAIL: admin-staging nicht erreichbar (HTTP $STATUS)" - exit 1 - fi + for i in $(seq 1 12); do + STATUS=$(curl -sS -o /dev/null -w '%{http_code}' \ + https://admin.staging.rebreak.org/ 2>/dev/null || echo "000") + echo "Attempt $i: admin.staging.rebreak.org/ -> HTTP $STATUS" + if [ "$STATUS" != "000" ] && [ "$STATUS" != "502" ] && [ "$STATUS" != "503" ]; then + echo "Health-Check PASSED" + exit 0 + fi + sleep 5 + done + echo "FAIL: admin-staging nicht erreichbar (HTTP $STATUS) nach 60s" + exit 1 diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 6f2fb5a..3851ea4 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -137,11 +137,17 @@ jobs: - name: Health-Check (HTTP 401 = Server erreichbar + auth-protected) run: | - sleep 5 - STATUS=$(curl -sS -o /dev/null -w '%{http_code}' \ - https://staging.rebreak.org/api/auth/me || echo "000") - echo "staging.rebreak.org/api/auth/me -> HTTP $STATUS" - if [ "$STATUS" != "401" ] && [ "$STATUS" != "200" ]; then - echo "FAIL: erwartet 401/200, bekommen $STATUS" - exit 1 - fi + # Infisical-Login + Nitro-Start + Cron-Init braucht auf gestresstem Server bis 30s. + # Retry-Loop: bis zu 12x mit 5s Pause = max 60s warten. + for i in $(seq 1 12); do + STATUS=$(curl -sS -o /dev/null -w '%{http_code}' \ + https://staging.rebreak.org/api/auth/me 2>/dev/null || echo "000") + echo "Attempt $i: staging.rebreak.org/api/auth/me -> HTTP $STATUS" + if [ "$STATUS" = "401" ] || [ "$STATUS" = "200" ]; then + echo "Health-Check PASSED" + exit 0 + fi + sleep 5 + done + echo "FAIL: erwartet 401/200, zuletzt bekommen $STATUS nach 60s" + exit 1 diff --git a/.github/workflows/maestro-cloud.yml b/.github/workflows/maestro-cloud.yml index 1a248c0..cd4c924 100644 --- a/.github/workflows/maestro-cloud.yml +++ b/.github/workflows/maestro-cloud.yml @@ -1,9 +1,9 @@ -# Maestro Cloud — E2E template for rebreak-native. +# Maestro Cloud — E2E for rebreak-native. # STATUS: TEMPLATE ONLY — not active. Requires User confirmation before enabling. # -# Trigger: manual dispatch OR PR to main (commented out — enable after User GO). +# Trigger: manual dispatch only (PR-trigger commented out — enable after User GO). # Requires: -# - MAESTRO_CLOUD_API_KEY in GitHub Actions secrets +# - MAESTRO_CLOUD_API_KEY in GitHub Actions secrets (environment: staging) # - EAS_TOKEN in GitHub Actions secrets # - E2E_TEST_USER + E2E_TEST_PASSWORD in GitHub Actions secrets # - Maestro Cloud account configured at mobile.dev @@ -32,12 +32,28 @@ jobs: maestro-cloud: name: E2E (${{ inputs.platform || 'ios' }}) runs-on: ubuntu-latest - - # Skip entirely if Maestro Cloud key is not configured — - # avoids CI failure on forks or before Cloud is set up. - if: ${{ secrets.MAESTRO_CLOUD_API_KEY != '' }} + environment: staging steps: + - name: Check required secrets + env: + MAESTRO_CLOUD_API_KEY: ${{ secrets.MAESTRO_CLOUD_API_KEY }} + EAS_TOKEN: ${{ secrets.EAS_TOKEN }} + E2E_TEST_USER: ${{ secrets.E2E_TEST_USER }} + E2E_TEST_PASSWORD: ${{ secrets.E2E_TEST_PASSWORD }} + run: | + missing=() + [ -z "$MAESTRO_CLOUD_API_KEY" ] && missing+=("MAESTRO_CLOUD_API_KEY") + [ -z "$EAS_TOKEN" ] && missing+=("EAS_TOKEN") + [ -z "$E2E_TEST_USER" ] && missing+=("E2E_TEST_USER") + [ -z "$E2E_TEST_PASSWORD" ] && missing+=("E2E_TEST_PASSWORD") + if [ ${#missing[@]} -gt 0 ]; then + echo "FATAL: Folgende Secrets fehlen in GitHub Actions (environment: staging):" + printf ' - %s\n' "${missing[@]}" + exit 1 + fi + echo "All required secrets present" + - name: Checkout uses: actions/checkout@v4 @@ -48,16 +64,15 @@ jobs: - name: Setup pnpm uses: pnpm/action-setup@v4 - with: - version: 9 - name: Install dependencies run: pnpm install --frozen-lockfile working-directory: apps/rebreak-native - # Build app via EAS — requires EAS_TOKEN secret and eas.json configured. + # Build app via EAS. # Profile "preview" must produce a .ipa (iOS) or .apk (Android). - - name: Build with EAS + # eas.json in apps/rebreak-native/ muss "preview"-Profile definieren. + - name: Setup EAS uses: expo/expo-github-action@v8 with: eas-version: latest @@ -72,7 +87,6 @@ jobs: --output ./build-artifact working-directory: apps/rebreak-native - # Install Maestro CLI - name: Install Maestro CLI run: curl -Ls "https://get.maestro.mobile.dev" | bash env: @@ -81,14 +95,12 @@ jobs: - name: Add Maestro to PATH run: echo "$HOME/.maestro/bin" >> $GITHUB_PATH - # Upload build + run flows on Maestro Cloud - name: Run Maestro Cloud run: | maestro cloud \ - --apiKey ${{ secrets.MAESTRO_CLOUD_API_KEY }} \ - --app ./build-artifact \ - --device ${{ inputs.platform || 'ios' }} \ - --env=E2E_TEST_USER=${{ secrets.E2E_TEST_USER }} \ - --env=E2E_TEST_PASSWORD=${{ secrets.E2E_TEST_PASSWORD }} \ + --apiKey "${{ secrets.MAESTRO_CLOUD_API_KEY }}" \ + --app ./apps/rebreak-native/build-artifact \ + --device "${{ inputs.platform || 'ios' }}" \ + --env=E2E_TEST_USER="${{ secrets.E2E_TEST_USER }}" \ + --env=E2E_TEST_PASSWORD="${{ secrets.E2E_TEST_PASSWORD }}" \ apps/rebreak-native/.maestro/ - working-directory: ${{ github.workspace }} diff --git a/.gitignore b/.gitignore index b4f105f..18b4869 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ xgit # Server-only deploy state (NICHT committen — wird vom deploy.sh gepflegt) .last-deployed-sha +.deploy-ga.lock diff --git a/scripts/deploy-from-artifact.sh b/scripts/deploy-from-artifact.sh index 55ec302..ccf2daa 100755 --- a/scripts/deploy-from-artifact.sh +++ b/scripts/deploy-from-artifact.sh @@ -11,6 +11,10 @@ # - Atomic .output-staging-Replacement bleibt # # Failure-Mode: Bei Migration-Fehler kein pm2-restart (Daten-Konsistenz-Schutz). +# +# Lockfile: setzt /srv/rebreak/.deploy-ga.lock waehrend Deploy laeuft. +# deploy.sh (webhook-trigger) respektiert diesen Lock und bricht ab. +# Verhindert Race-Condition wenn Webhook und GH-Actions gleichzeitig deployen. set -euo pipefail @@ -19,6 +23,7 @@ APP_DIR="${REPO_ROOT}/backend" ARTIFACT="${APP_DIR}/.output-incoming.tar.gz" PM2_BIN="/root/.nvm/versions/node/v24.11.1/bin/pm2" PNPM_BIN="/root/.nvm/versions/node/v24.11.1/bin/pnpm" +GA_LOCK="${REPO_ROOT}/.deploy-ga.lock" log() { echo "[deploy-artifact] $(date '+%H:%M:%S') $*"; } log_err() { echo "[deploy-artifact:err] $(date '+%H:%M:%S') $*" >&2; } @@ -27,7 +32,22 @@ log "=== Rebreak Deploy-from-Artifact gestartet ===" export PATH="/root/.nvm/versions/node/v24.11.1/bin:$PATH" -# 0. Sanity-Check Artifact +# 0a. Exklusiv-Lock setzen (verhindert parallelen Webhook-Deploy) +if ! ( set -o noclobber; echo "$$" > "$GA_LOCK" ) 2>/dev/null; then + LOCK_PID=$(cat "$GA_LOCK" 2>/dev/null || echo "?") + # Stale lock (Prozess tot)? Dann ueberschreiben. + if ! kill -0 "$LOCK_PID" 2>/dev/null; then + log "Staler Lock von PID $LOCK_PID gefunden -- ueberschreibe" + echo "$$" > "$GA_LOCK" + else + log_err "Ein anderer Deploy laeuft bereits (PID $LOCK_PID) -- abort" + exit 1 + fi +fi +# Lock beim Beenden immer aufraumen +trap 'rm -f "$GA_LOCK"' EXIT + +# 0b. Sanity-Check Artifact [[ -f "$ARTIFACT" ]] || { log_err "Artifact $ARTIFACT fehlt -- abort"; exit 1; } # 1. Git pull (fuer scripts/-Updates + prisma/migrations + .last-deployed-sha) @@ -93,11 +113,13 @@ fi # 3. Runtime-Deps installieren (nur falls package.json/lockfile changed) # Prisma-Client ist schon im Artifact baked-in via `prisma generate` auf dem Runner, # aber Runtime-Module (z.B. @prisma/client native binaries) muessen lokal sein. +# --prefer-offline: nutzt pnpm-Store-Cache wenn moeglich (kein neuer Download). +# Store wächst unbegrenzt -- prunen via: pnpm store prune (z.B. monatlich als Cron). log "Step 3: pnpm install (runtime-deps)..." cd "${REPO_ROOT}" -CI=true "${PNPM_BIN}" install --frozen-lockfile 2>&1 || { +CI=true "${PNPM_BIN}" install --frozen-lockfile --prefer-offline 2>&1 || { log_err "frozen-lockfile fehlgeschlagen, fallback ohne frozen..." - CI=true "${PNPM_BIN}" install --no-frozen-lockfile 2>&1 + CI=true "${PNPM_BIN}" install --no-frozen-lockfile --prefer-offline 2>&1 } log "pnpm install done" diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 7c85193..96c60b7 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -6,6 +6,10 @@ # Clone: git@github.com:RaynisDev/rebreak.git # Backend: /srv/rebreak/backend (standalone Nitro, package: rebreak-backend) # +# WICHTIG: GitHub Actions ist der primaere Deploy-Weg (deploy-from-artifact.sh). +# Dieses Script ist Fallback/Legacy-Pfad und wird NICHT ausgefuehrt wenn +# GH-Actions gerade deployed (.deploy-ga.lock). +# # Ablauf: # 1. Git pull (via Deploy-Key) # 2. pnpm install --frozen-lockfile (mit hoisted node-linker via .npmrc) @@ -25,6 +29,7 @@ APP_DIR="${REPO_ROOT}/backend" NODE_BIN="/root/.nvm/versions/node/v24.11.1/bin/node" PNPM_BIN="/root/.nvm/versions/node/v24.11.1/bin/pnpm" PM2_BIN="/root/.nvm/versions/node/v24.11.1/bin/pm2" +GA_LOCK="${REPO_ROOT}/.deploy-ga.lock" log() { echo "[deploy] $(date '+%H:%M:%S') $*"; } log_err() { echo "[deploy:err] $(date '+%H:%M:%S') $*" >&2; } @@ -34,6 +39,18 @@ log "=== Rebreak Deploy gestartet (backend/-Layout) ===" # 0. Sicherstellen dass PATH stimmt export PATH="/root/.nvm/versions/node/v24.11.1/bin:$PATH" +# 0a. GH-Actions-Lock pruefen: wenn deploy-from-artifact.sh laeuft, nicht doppeln. +if [[ -f "$GA_LOCK" ]]; then + LOCK_PID=$(cat "$GA_LOCK" 2>/dev/null || echo "") + if [[ -n "$LOCK_PID" ]] && kill -0 "$LOCK_PID" 2>/dev/null; then + log "GitHub-Actions-Deploy laeuft gerade (PID $LOCK_PID) -- Webhook-Deploy abgebrochen (kein Fehler)" + exit 0 + else + log "Staler GA-Lock gefunden (PID $LOCK_PID) -- wird ignoriert" + rm -f "$GA_LOCK" + fi +fi + # 1. Git pull via Deploy-Key (SSH ist konfiguriert in /root/.ssh/config) log "Step 1: git pull..." cd "${REPO_ROOT}"