#!/usr/bin/env bash
#
# doctorbot: periodic health watchdog for OpenClaw.
#
# Each run:
#   1. Runs the memory integrity check; if it fails, runs the restore script
#      and appends the restore output to the log.
#   2. Probes `openclaw status` and tracks the number of consecutive failed
#      probes in a small JSON state file.
#   3. On the 2nd+ consecutive failure, tries `openclaw gateway restart`, then
#      `systemctl --user restart openclaw-gateway.service`, re-probing after
#      each attempt.
#
# Every handled path ends with `exit 0`; outcomes are reported via the log.
set -euo pipefail

WORKSPACE="/home/openclaw/.openclaw/workspace"
LOG="${WORKSPACE}/logs/doctorbot.log"
STATE_DIR="${WORKSPACE}/memory"
STATE_FILE="${STATE_DIR}/doctorbot-state.json"
MEM_CHECK_SCRIPT="${WORKSPACE}/scripts/memory-integrity-check.sh"
MEM_RESTORE_SCRIPT="${WORKSPACE}/scripts/memory-restore-from-db.sh"

mkdir -p "${WORKSPACE}/logs" "${STATE_DIR}"

# Append one UTC-timestamped line to the log file.
# Arguments: message words (joined with spaces).
log() {
  echo "[$(date -u +'%F %T UTC')] $*" >> "$LOG"
}

# Print the stored consecutive-failure count on stdout.
# Prints 0 when the state file is missing, unparsable, or has no usable
# `fail_count` value.
read_fail_count() {
  if [[ -f "$STATE_FILE" ]]; then
    python3 - <<'PY' "$STATE_FILE"
import json,sys
p=sys.argv[1]
try:
  d=json.load(open(p,'r',encoding='utf-8'))
  print(int(d.get('fail_count',0)))
except Exception:
  print(0)
PY
  else
    echo 0
  fi
}

# Persist the consecutive-failure count to the state file.
# Arguments: $1 - integer failure count.
# NOTE(review): the original heredoc body was lost when this file was
# flattened. Only the `fail_count` key is read back by read_fail_count, so
# that is all that is written here - confirm no other fields are expected.
write_state() {
  local count="$1"
  cat > "$STATE_FILE" <<EOF
{"fail_count": ${count}}
EOF
}

# Memory integrity check, with a restore attempt when it fails.
# NOTE(review): the opening of this `if` was also lost; it is reconstructed
# from MEM_CHECK_SCRIPT and the log message below - confirm against history.
if ! "$MEM_CHECK_SCRIPT" >/dev/null 2>&1; then
  log "warn: memory integrity check failed"
  if "$MEM_RESTORE_SCRIPT" >> "$LOG" 2>&1; then
    log "ok: memory restore completed after failed integrity check"
  else
    # In the else branch $? still holds the restore script's exit status.
    rc=$?
    log "error: memory restore failed (code=${rc})"
  fi
fi

healthy=0
if openclaw status >/dev/null 2>&1; then
  healthy=1
fi

fail_count=$(read_fail_count)

if [[ "$healthy" -eq 1 ]]; then
  if [[ "$fail_count" -ne 0 ]]; then
    log "ok: status healthy again, reset fail counter"
  fi
  write_state 0
  exit 0
fi

fail_count=$((fail_count+1))
write_state "$fail_count"
log "warn: health check failed (consecutive=${fail_count})"

# 1st failure: no action, just observe
if [[ "$fail_count" -lt 2 ]]; then
  exit 0
fi

# 2nd+ consecutive failure: try gentle recovery once per run
log "action: openclaw gateway restart"
# Best effort: a failed restart must not abort the script under `set -e`;
# the status probe below decides whether recovery worked.
openclaw gateway restart >/dev/null 2>&1 || true
sleep 4

if openclaw status >/dev/null 2>&1; then
  log "ok: recovered after gateway restart"
  write_state 0
  exit 0
fi

log "action: systemctl --user restart openclaw-gateway.service"
# Best effort, same reasoning as above.
systemctl --user restart openclaw-gateway.service >/dev/null 2>&1 || true
sleep 4

if openclaw status >/dev/null 2>&1; then
  log "ok: recovered after user service restart"
  write_state 0
  exit 0
fi

# The failure count stays persisted so the next run retries recovery.
log "error: unrecovered (consecutive=${fail_count})"
exit 0