Files
openclaw/scripts/doctorbot-guardian.sh
2026-03-01 17:44:19 +03:00

101 lines
2.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# doctorbot-guardian: periodic health guard for the openclaw service.
#
# Runs a memory integrity check (restoring on failure), probes
# `openclaw status`, keeps a consecutive-failure counter in STATE_FILE and,
# from the second consecutive failure on, attempts restarts. Everything is
# reported by appending to LOG.

# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -o errexit -o nounset -o pipefail

# Fixed locations inside the openclaw workspace.
WORKSPACE='/home/openclaw/.openclaw/workspace'
LOG="$WORKSPACE/logs/doctorbot.log"
STATE_DIR="$WORKSPACE/memory"
STATE_FILE="$STATE_DIR/doctorbot-state.json"
MEM_CHECK_SCRIPT="$WORKSPACE/scripts/memory-integrity-check.sh"
MEM_RESTORE_SCRIPT="$WORKSPACE/scripts/memory-restore-from-db.sh"

# Make sure the log directory and the state directory exist before first use.
mkdir -p "${LOG%/*}" "$STATE_DIR"
#######################################
# Append one timestamped line to the guardian log.
# Globals:   LOG (read) - path of the log file that is appended to
# Arguments: $@ - message words; joined into a single line
# Outputs:   "[YYYY-MM-DD HH:MM:SS UTC] <message>" appended to LOG
#######################################
log() {
  printf '[%s] %s\n' "$(date -u +'%F %T UTC')" "$*" >> "$LOG"
}
#######################################
# Print the persisted consecutive-failure counter.
#
# The counter is read from the "fail_count" key of the JSON state file.
# This function never fails: callers use it as `x=$(read_fail_count)` under
# `set -e`, so a non-zero status here would abort the whole guardian.
#
# Globals:   STATE_FILE (read)
# Arguments: none
# Outputs:   one non-negative integer on stdout; 0 when the state file is
#            missing, unreadable or malformed, when the stored value is
#            negative or not a number, or when python3 cannot be run
# Returns:   0 always
#######################################
read_fail_count() {
  local count=0
  # Python source is kept at column 0 because Python is indentation-sensitive.
  # It uses only double quotes so it can live inside a single-quoted string.
  local py_reader='
import json
import sys

try:
    with open(sys.argv[1], "r", encoding="utf-8") as fh:
        state = json.load(fh)
    print(max(0, int(state.get("fail_count", 0))))
except Exception:
    print(0)
'

  if [[ -f "$STATE_FILE" ]]; then
    # A missing or crashing interpreter must not propagate: fall back to 0.
    count="$(python3 -c "$py_reader" "$STATE_FILE" 2>/dev/null)" || count=0
  fi

  # Final guard so callers can always use the result in arithmetic.
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "$count"
}
#######################################
# Persist the consecutive-failure counter together with a UTC timestamp.
#
# The JSON document is written to a temporary file next to STATE_FILE and
# then renamed over it, so a reader (or an overlapping guardian run) never
# observes a truncated or half-written state file.
#
# Globals:   STATE_FILE (read) - destination path; its directory must exist
# Arguments: $1 - failure count, a non-negative decimal integer
# Outputs:   diagnostics on stderr when the count is rejected
# Returns:   0 on success; 1 if the count is invalid or the file could not
#            be written (an existing state file is left untouched)
#######################################
write_state() {
  local count="$1"
  local stamp tmp_file

  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    printf 'write_state: fail count must be a non-negative integer, got: %s\n' \
      "$count" >&2
    return 1
  fi
  # Force base 10: strips leading zeros, which are not valid in JSON numbers.
  count=$((10#$count))

  stamp="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" || return 1

  # Same directory as the target so the final rename stays on one filesystem.
  # The PID suffix keeps overlapping runs from sharing a temp file.
  tmp_file="${STATE_FILE}.$$.tmp"

  if ! printf '{\n  "fail_count": %s,\n  "updated_utc": "%s"\n}\n' \
    "$count" "$stamp" > "$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  if ! mv -f -- "$tmp_file" "$STATE_FILE"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}
# Memory integrity guard + auto-restore from external DB.
# The check script's output is discarded; only its exit status matters. On a
# non-zero status the restore script runs with its output appended to LOG.
# Neither outcome stops the guardian: the health check below always runs.
# NOTE(review): a missing or non-executable check script also yields a
# non-zero status and therefore triggers a restore - confirm this is intended.
if "$MEM_CHECK_SCRIPT" >/dev/null 2>&1; then
  : # integrity check passed; nothing to repair
else
  log "warn: memory integrity check failed"
  rc=0
  "$MEM_RESTORE_SCRIPT" >> "$LOG" 2>&1 || rc=$?
  case "$rc" in
    0) log "ok: memory restore completed after failed integrity check" ;;
    *) log "error: memory restore failed (code=${rc})" ;;
  esac
fi
# Probe the service once. `openclaw status` output is discarded; only its
# exit status decides whether this run counts as healthy.
healthy=0
openclaw status >/dev/null 2>&1 && healthy=1

# Consecutive failures recorded by previous runs.
fail_count="$(read_fail_count)"

# Healthy path: note a recovery if earlier runs had failed, reset the counter
# (this also refreshes the timestamp in the state file) and stop here.
if (( healthy == 1 )); then
  if (( fail_count != 0 )); then
    log "ok: status healthy again, reset fail counter"
  fi
  write_state 0
  exit 0
fi
# Unhealthy path: record one more consecutive failure before deciding
# whether to act on it.
fail_count=$(( fail_count + 1 ))
write_state "$fail_count"
log "warn: health check failed (consecutive=${fail_count})"

# 1st failure: no action, just observe
(( fail_count >= 2 )) || exit 0
# 2nd+ consecutive failure: try gentle recovery once per run.
# Strategies are tried in order: first the gateway's own restart command,
# then a restart of the systemd user unit. After each attempt the script
# waits briefly and re-probes `openclaw status`; the first success resets the
# failure counter and ends the run.
for strategy in gateway service; do
  case "$strategy" in
    gateway)
      restart_cmd=(openclaw gateway restart)
      restart_desc="openclaw gateway restart"
      recovered_msg="ok: recovered after gateway restart"
      ;;
    service)
      restart_cmd=(systemctl --user restart openclaw-gateway.service)
      restart_desc="systemctl --user restart openclaw-gateway.service"
      recovered_msg="ok: recovered after user service restart"
      ;;
  esac

  log "action: ${restart_desc}"

  # Best effort: a failing restart must not abort the run, but its exit code
  # is logged so "restart could not run" is distinguishable from "restart ran
  # and did not help" when reading the log afterwards.
  restart_rc=0
  "${restart_cmd[@]}" >/dev/null 2>&1 || restart_rc=$?
  if (( restart_rc != 0 )); then
    log "warn: '${restart_desc}' exited with code ${restart_rc}"
  fi

  # Give the service a moment to come up before probing again.
  sleep 4

  if openclaw status >/dev/null 2>&1; then
    log "$recovered_msg"
    write_state 0
    exit 0
  fi
done

# Still unhealthy. The failure counter was already persisted earlier in the
# run, so the next run continues from it. The exit status stays 0 even here.
# NOTE(review): presumably so the scheduler does not flag the run - confirm.
log "error: unrecovered (consecutive=${fail_count})"
exit 0