101 lines
2.2 KiB
Bash
Executable File
101 lines
2.2 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Health-check script for an openclaw installation: strict-mode setup and the
# filesystem locations used by the rest of the file.

# Abort on unhandled command failure, unset variables, and failed pipeline stages.
set -euo pipefail

# Root directory under which every path below lives.
readonly WORKSPACE="/home/openclaw/.openclaw/workspace"
# Append-only log file written by log().
readonly LOG="${WORKSPACE}/logs/doctorbot.log"
# Directory and JSON file that persist the consecutive-failure counter.
readonly STATE_DIR="${WORKSPACE}/memory"
readonly STATE_FILE="${STATE_DIR}/doctorbot-state.json"
# External helper scripts run by the memory integrity guard.
readonly MEM_CHECK_SCRIPT="${WORKSPACE}/scripts/memory-integrity-check.sh"
readonly MEM_RESTORE_SCRIPT="${WORKSPACE}/scripts/memory-restore-from-db.sh"

# Make sure the log and state directories exist before anything writes to them.
mkdir -p -- "${WORKSPACE}/logs" "${STATE_DIR}"

#######################################
# Appends one UTC-timestamped line to the log file.
# Globals:   LOG (read) - path of the log file to append to
# Arguments: $* - message words, joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS UTC] message" appended to $LOG; if the
#            append fails, the same line goes to stderr instead
# Returns:   0 always, so a logging problem never aborts the caller under
#            `set -e`
#######################################
log() {
  local stamp
  stamp=$(date -u +'%F %T UTC') || stamp='unknown-time'

  # The braces let 2>/dev/null also hide the shell's own redirection error
  # when $LOG cannot be opened; the fallback below reports the message itself.
  if ! { printf '[%s] %s\n' "$stamp" "$*" >> "$LOG"; } 2>/dev/null; then
    printf '[%s] %s\n' "$stamp" "$*" >&2 || true
  fi
}

#######################################
# Prints the persisted consecutive-failure counter.
# Globals:   STATE_FILE (read) - JSON file holding a "fail_count" field
# Arguments: none
# Outputs:   a non-negative integer on stdout. Prints 0 when the state file
#            is missing, cannot be parsed, has no usable fail_count, or when
#            python3 cannot be run.
# Returns:   0 always, so `var=$(read_fail_count)` cannot trip `set -e`
#######################################
read_fail_count() {
  local count=0

  if [[ -f "$STATE_FILE" ]]; then
    # The file path is passed as argv[1] rather than interpolated into the
    # Python source, so unusual characters in the path cannot alter the code.
    count=$(python3 -c '
import json
import sys

try:
    with open(sys.argv[1], "r", encoding="utf-8") as fh:
        data = json.load(fh)
    print(int(data.get("fail_count", 0)))
except Exception:
    print(0)
' "$STATE_FILE" 2>/dev/null) || count=0
  fi

  # Callers do integer arithmetic on the result; anything that is not a plain
  # non-negative integer (empty output, negative value, stray text) becomes 0.
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "$count"
}

#######################################
# Persists the consecutive-failure counter together with a UTC timestamp.
# Globals:   STATE_FILE (written) - replaced with a JSON object holding
#            "fail_count" and "updated_utc"
# Arguments: $1 - non-negative integer failure count
# Outputs:   an error message on stderr when $1 is not a non-negative integer
# Returns:   0 on success, 2 on an invalid count, 1 when the file cannot be
#            written (the previous state file is left untouched in both
#            failure cases)
#######################################
write_state() {
  local count="${1:-}"
  local tmp_file="${STATE_FILE}.tmp.$$"
  local stamp

  if [[ ! "$count" =~ ^[0-9]+$ ]]; then
    printf 'write_state: fail count must be a non-negative integer, got "%s"\n' \
      "$count" >&2
    return 2
  fi
  # Force base 10 so a value such as "007" is stored as valid JSON (7).
  count=$((10#$count))
  stamp=$(date -u +'%Y-%m-%dT%H:%M:%SZ')

  # Write the complete document to a sibling temp file and rename it over the
  # target: an interrupted or failed write then never leaves a truncated state
  # file behind for the next run to read.
  if ! printf '{\n  "fail_count": %s,\n  "updated_utc": "%s"\n}\n' \
    "$count" "$stamp" > "$tmp_file" \
    || ! mv -f -- "$tmp_file" "$STATE_FILE"; then
    rm -f -- "$tmp_file"
    return 1
  fi
}

# Seconds to wait after a restart command before probing status again.
RESTART_SETTLE_SECS=4

# Returns 0 when `openclaw status` exits successfully; its output is discarded.
openclaw_healthy() {
  openclaw status >/dev/null 2>&1
}

# Logs "action: <command>", runs the command given as "$@", then waits
# RESTART_SETTLE_SECS seconds.
restart_and_settle() {
  log "action: $*"
  # Best-effort: a failing restart command must not abort the run under
  # `set -e`; the status probe that follows decides whether it helped.
  "$@" >/dev/null 2>&1 || true
  sleep "$RESTART_SETTLE_SECS"
}

# Memory integrity guard + auto-restore from external DB.
# A checker that is missing or not executable says nothing about the data, so
# it is reported and the restore is skipped rather than treated as corruption.
if [[ ! -f "$MEM_CHECK_SCRIPT" || ! -x "$MEM_CHECK_SCRIPT" ]]; then
  log "error: memory integrity check script missing or not executable: ${MEM_CHECK_SCRIPT}"
elif ! "$MEM_CHECK_SCRIPT" >/dev/null 2>&1; then
  log "warn: memory integrity check failed"
  # Restore output (stdout and stderr) is appended to the log file.
  if "$MEM_RESTORE_SCRIPT" >> "$LOG" 2>&1; then
    log "ok: memory restore completed after failed integrity check"
  else
    # In the else branch $? still holds the restore script's exit status.
    rc=$?
    log "error: memory restore failed (code=${rc})"
  fi
fi

# Load the persisted counter. A failing reader or non-numeric output is
# treated as "no previous failures" instead of aborting under `set -e`.
fail_count=$(read_fail_count) || fail_count=0
[[ "$fail_count" =~ ^[0-9]+$ ]] || fail_count=0
fail_count=$((10#$fail_count))

if openclaw_healthy; then
  if ((fail_count != 0)); then
    log "ok: status healthy again, reset fail counter"
  fi
  write_state 0
  exit 0
fi

# Unhealthy: record one more consecutive failure before deciding what to do.
fail_count=$((fail_count + 1))
write_state "$fail_count"
log "warn: health check failed (consecutive=${fail_count})"

# 1st failure: no action, just observe
if ((fail_count < 2)); then
  exit 0
fi

# 2nd+ consecutive failure: try gentle recovery once per run
restart_and_settle openclaw gateway restart
if openclaw_healthy; then
  log "ok: recovered after gateway restart"
  write_state 0
  exit 0
fi

# Gateway restart did not help: restart the user-level systemd unit instead.
restart_and_settle systemctl --user restart openclaw-gateway.service
if openclaw_healthy; then
  log "ok: recovered after user service restart"
  write_state 0
  exit 0
fi

log "error: unrecovered (consecutive=${fail_count})"
# NOTE(review): exits 0 even when unrecovered; presumably so the scheduler
# running this script does not flag the run as failed - confirm.
exit 0