# railiance-infra/Makefile

# -------- RailianceHosts Make Utilities --------
# Run every recipe under bash (looked up through env) instead of the default /bin/sh.
SHELL := /usr/bin/env bash
# A bare `make` runs the `help` target.
.DEFAULT_GOAL := help
# Set this to your Gitea host if you want 'remote-set' helper
GITEA ?= gitea.example.com
OWNER ?= coulomb
REPO ?= railiance-infra
# New-host defaults (can be overridden: make new-host NAME=... TYPE=...)
TYPE ?= cpx11
REGION ?= nbg1
ROLE ?= core
IMG ?= ubuntu-24.04
# NOTE(review): `?=` only assigns when the variable is not already set, and
# USER is normally inherited from the login environment — so this default
# presumably never takes effect unless USER is unset. Confirm the intent.
USER ?= admin
# Decrypt Hetzner token at runtime (requires SOPS_AGE_KEY or keys.txt locally)
# `:=` with $(shell ...) runs sops once while the Makefile is parsed, i.e. on
# every invocation (including `make help`). stderr is discarded, so a failed
# decrypt is silent here; the tf-* targets check for an empty value themselves.
HCLOUD_TOKEN := $(shell sops -d --extract '["hetzner"]["token"]' secrets/hetzner-token.yaml 2>/dev/null)
# ---- Help ----
# List every documented target. A rule is "documented" when its header line
# carries a trailing `## description`. -h stops grep from prefixing file names
# when $(MAKEFILE_LIST) contains more than one makefile, and the pattern avoids
# the non-portable `.*?` construct.
.PHONY: help
help: ## Show this help
	@echo "RailianceHosts Commands"; \
	grep -hE '^[a-zA-Z0-9_-]+:.*## ' $(MAKEFILE_LIST) | sort | sed 's/:.*##/: /'
# ---- Git hooks ----
# Point git at the repo-local hook directory. The pre-commit hook is verified
# and made executable BEFORE core.hooksPath is switched, so a missing hook never
# leaves git configured to use a directory without one.
.PHONY: hooks
hooks: ## Configure git to use repo-local hooks (.githooks) and ensure executables
	@mkdir -p .githooks
	@test -f .githooks/pre-commit || (echo "❌ Missing .githooks/pre-commit"; exit 1)
	chmod +x .githooks/pre-commit
	git config core.hooksPath .githooks
	@echo "✔ hooks enabled and pre-commit is executable"
# Stage a throw-away plaintext file under secrets/ and try to commit it.
# The commit is expected to be rejected by the pre-commit hook. Check and
# clean-up share one shell so the probe file is always unstaged and removed,
# and the target exits non-zero when the hook failed to block the commit.
.PHONY: hooks-test
hooks-test: ## Test secrets hook blocks plaintext in secrets/
	@mkdir -p secrets && echo 'PLAINTEXT_TEST=true' > secrets/_hook_test.yaml
	@git add secrets/_hook_test.yaml || true
	@rc=0; \
	if git commit -m "TEST: should be blocked" 2>/dev/null; then \
	  echo "❌ Hook did NOT block plaintext (check .githooks/pre-commit)"; \
	  git reset --soft HEAD~1; \
	  rc=1; \
	else \
	  echo "✔ Hook blocked plaintext as expected"; \
	fi; \
	git restore --staged secrets/_hook_test.yaml || true; \
	rm -f secrets/_hook_test.yaml; \
	exit $$rc
# ---- SOPS / Age helpers ----
# Install the age identity where SOPS looks for it by default. An existing
# keys.txt is never overwritten (same effect as the former `cp -n`). A missing
# source key is reported explicitly instead of being swallowed and surfacing
# later as a chmod error.
.PHONY: sops-setup
sops-setup: ## Copy age key to SOPS default path (~/.config/sops/age/keys.txt)
	@mkdir -p ~/.config/sops/age
	@if [ ! -f ~/.config/sops/age/keys.txt ]; then \
	  [ -f ~/.config/age/key.txt ] || { echo "❌ No age key found at ~/.config/age/key.txt"; exit 1; }; \
	  cp ~/.config/age/key.txt ~/.config/sops/age/keys.txt; \
	fi
	chmod 600 ~/.config/sops/age/keys.txt
	@echo "✔ SOPS key path set (~/.config/sops/age/keys.txt). Alternatively export SOPS_AGE_KEY."
# Open the encrypted Hetzner token file in the SOPS editor.
.PHONY: sops-edit
sops-edit: ## Edit the global secrets with SOPS
	sops secrets/hetzner-token.yaml
# Encrypt FILE in place. FILE is mandatory and is quoted so that paths
# containing spaces reach sops as a single argument.
.PHONY: sops-encrypt
sops-encrypt: ## Encrypt a file in place: make sops-encrypt FILE=secrets/foo.yaml
	@[ -n "$(FILE)" ] || (echo "Usage: make sops-encrypt FILE=secrets/xxx.yaml" && exit 1)
	sops --encrypt --in-place "$(FILE)"
	@echo "✔ Encrypted $(FILE)"
# Decrypt FILE to stdout only; nothing is written to disk. FILE is mandatory.
.PHONY: sops-decrypt
sops-decrypt: ## Print decrypted file to stdout (for inspection) FILE=secrets/foo.sops.yaml
	@[ -n "$(FILE)" ] || (echo "Usage: make sops-decrypt FILE=secrets/xxx.sops.yaml" && exit 1)
	sops -d "$(FILE)"
# Run `sops --rotate` on FILE in place. FILE is mandatory.
.PHONY: sops-rotate
sops-rotate: ## Rotate recipients on a SOPS file (after updating .sops.yaml)
	@[ -n "$(FILE)" ] || (echo "Usage: make sops-rotate FILE=secrets/xxx.sops.yaml" && exit 1)
	sops --rotate --in-place "$(FILE)"
# Every tracked file under secrets/ must contain a SOPS metadata marker:
# a top-level `sops:` key (YAML) or a `"sops":` key (JSON). `grep -L` lists the
# files that contain NO match. The complete list is collected first so that all
# offenders are printed (the former `tee | read` pipeline could stop after the
# first line). File names are passed NUL-delimited to survive whitespace.
.PHONY: check-secrets
check-secrets: ## Fail if any file in secrets/ is not SOPS-encrypted
	@unencrypted="$$(git ls-files -z -- secrets | xargs -0 -r grep -L -E '(^sops:$$|"sops"[[:space:]]*:)' || true)"; \
	if [ -n "$$unencrypted" ]; then \
	  printf '%s\n' "$$unencrypted" >&2; \
	  echo "❌ Unencrypted secrets detected above. Encrypt with: sops --encrypt --in-place <file>"; \
	  exit 1; \
	fi
	@echo "✔ All files in secrets/ appear SOPS-encrypted"
# ---- Terraform (Hetzner) ----
# Format the Terraform sources. Formatting is purely local, so no Hetzner token
# is required. The previous recipe wrote `; @terraform ...`: a mid-line `@` is
# not a make prefix, so the shell looked for a command named "@terraform" and
# `|| true` hid the failure — fmt never ran. The leading `-` keeps the step
# best-effort while still showing any error.
.PHONY: tf-fmt
tf-fmt: ## Terraform fmt
	-@terraform -chdir=terraform/hetzner fmt -recursive
# Initialise the Terraform working directory. The token is handed to the recipe
# through the environment (target-specific export) rather than being expanded
# into the command text, so it is not printed by `make -n` and needs no shell
# quoting.
.PHONY: tf-init
tf-init: export HCLOUD_TOKEN := $(HCLOUD_TOKEN)
tf-init: ## Terraform init
	@[ -n "$$HCLOUD_TOKEN" ] || (echo "HCLOUD_TOKEN empty; export SOPS_AGE_KEY or set keys.txt & fill secrets.sops.yaml" && exit 1)
	@terraform -chdir=terraform/hetzner init
# Show the Terraform execution plan. The token travels via the environment and
# is expanded by the shell (`$$HCLOUD_TOKEN`), keeping it out of the recipe text.
# NOTE(review): `-var` is kept so variable precedence is unchanged; moving to
# TF_VAR_hcloud_token would also keep the token off terraform's argv, but it
# ranks below tfvars files.
.PHONY: tf-plan
tf-plan: export HCLOUD_TOKEN := $(HCLOUD_TOKEN)
tf-plan: tf-init ## Terraform plan (requires decrypted HCLOUD_TOKEN)
	@echo "🔍 Running terraform plan..."
	@[ -n "$$HCLOUD_TOKEN" ] || (echo "HCLOUD_TOKEN empty; export SOPS_AGE_KEY or set keys.txt & fill secrets.sops.yaml" && exit 1)
	@terraform -chdir=terraform/hetzner plan -var="hcloud_token=$$HCLOUD_TOKEN"
# Provision the infrastructure without an interactive prompt (-auto-approve).
# The token is supplied through the environment instead of the recipe text.
.PHONY: tf-apply
tf-apply: export HCLOUD_TOKEN := $(HCLOUD_TOKEN)
tf-apply: tf-init ## Terraform apply (provision)
	@[ -n "$$HCLOUD_TOKEN" ] || (echo "HCLOUD_TOKEN empty; export SOPS_AGE_KEY or set keys.txt & fill secrets.sops.yaml" && exit 1)
	@terraform -chdir=terraform/hetzner apply -auto-approve -var="hcloud_token=$$HCLOUD_TOKEN"
# Tear down everything Terraform manages. -auto-approve means there is NO
# confirmation prompt. The token is supplied through the environment instead of
# the recipe text.
.PHONY: tf-destroy
tf-destroy: export HCLOUD_TOKEN := $(HCLOUD_TOKEN)
tf-destroy: tf-init ## Terraform destroy (tear down)
	@[ -n "$$HCLOUD_TOKEN" ] || (echo "HCLOUD_TOKEN empty; export SOPS_AGE_KEY or set keys.txt & fill secrets.sops.yaml" && exit 1)
	@terraform -chdir=terraform/hetzner destroy -auto-approve -var="hcloud_token=$$HCLOUD_TOKEN"
# --- Terraform provider/lockfile helpers ---
# Directory holding the Hetzner Terraform configuration.
TF_DIR := terraform/hetzner
# Token value captured at parse time; empty when decryption did not succeed.
TF_TOKEN := $(HCLOUD_TOKEN)
# Provider dependency lock file maintained by `terraform init`.
LOCKFILE := $(TF_DIR)/.terraform.lock.hcl
# Commit the provider lockfile and nothing else. The commit is restricted to
# $(LOCKFILE) so unrelated staged changes are not swept in. "Nothing to commit"
# is detected explicitly, so a genuine commit failure still fails the target.
.PHONY: tf-lock-commit
tf-lock-commit: ## Commit the current provider lockfile
	@test -f "$(LOCKFILE)" || (echo "$(LOCKFILE) not found. Run 'make tf-init' first."; exit 1)
	@git add -- "$(LOCKFILE)"
	@if git diff --cached --quiet -- "$(LOCKFILE)"; then \
	  echo " No lockfile changes to commit."; \
	else \
	  git commit -m "chore(terraform): lock providers" -- "$(LOCKFILE)"; \
	fi
# Probe for provider upgrades by running `init -upgrade` with a read-only
# lockfile. A zero exit is reported as "up to date".
# NOTE(review): ANY init failure (network, missing terraform, bad config) is
# reported as "upgrades likely available", because the exit status is the only
# signal used and all output is discarded.
.PHONY: tf-providers-check
tf-providers-check: ## Check if newer provider versions are available (non-destructive)
	@echo "🔎 Checking for provider upgrades (lockfile readonly)…"
	@if terraform -chdir=$(TF_DIR) init -upgrade -lockfile=readonly >/dev/null 2>&1; then \
	  echo "✔ Providers up to date (no upgrades available)."; \
	else \
	  echo "↗ Provider upgrades likely available (readonly lockfile blocked changes)."; \
	  echo " Run: make tf-providers-upgrade"; \
	fi
# Re-run `terraform init -upgrade` and show the resulting lockfile diff so the
# operator can review it before committing. The diff step is best-effort.
.PHONY: tf-providers-upgrade
tf-providers-upgrade: ## Upgrade providers (updates .terraform.lock.hcl)
	@echo "⬆️ Upgrading providers…"
	@terraform -chdir="$(TF_DIR)" init -upgrade
	@echo "— Diff for $(LOCKFILE):"
	@git --no-pager diff -- "$(LOCKFILE)" || true
	@echo "💡 If changes look good: make tf-lock-commit"
# Upgrade first, then commit. The steps run as sequential sub-makes because
# prerequisites listed on one rule may run concurrently (or in any order) under
# `make -j`, which could commit the lockfile before the upgrade finished.
.PHONY: tf-providers-upgrade-commit
tf-providers-upgrade-commit: ## Upgrade providers and commit the lockfile
	@$(MAKE) --no-print-directory tf-providers-upgrade
	@$(MAKE) --no-print-directory tf-lock-commit
# Plan against the upgraded providers. `-var hcloud_token=...` is passed only
# when a token is available. Its value is read from the environment by the
# shell, so the secret does not appear in the recipe text.
.PHONY: tf-providers-plan
tf-providers-plan: export HCLOUD_TOKEN := $(TF_TOKEN)
tf-providers-plan: ## Plan after an upgrade (uses HCLOUD_TOKEN if set)
	@echo "🧪 Planning with upgraded providers…"
	@terraform -chdir=$(TF_DIR) plan $(if $(TF_TOKEN),-var="hcloud_token=$$HCLOUD_TOKEN")
# ---- Backup (Q3 Operability & Resilience — D4) ----
# Run the backup helper with root privileges. Declared phony so that a
# file or directory named `backup` can never mask this target.
.PHONY: backup
backup: ## Backup S1 OS config to /opt/backup/railiance/infra/ (age-encrypted, root required)
	sudo tools/cmd/railiance-backup-s1
# ---- Ansible ----
# Run the bootstrap playbook from the Ansible directory. Uses the shared
# $(ANS_DIR) / $(SSH_USER) knobs like the other Ansible targets; their defaults
# (`ansible`, `admin`) match the values that were previously hard-coded here.
.PHONY: ansible-bootstrap
ansible-bootstrap: ## Run base bootstrap play (users, ssh, ufw, sops-agent, custodian-agent)
	cd $(ANS_DIR) && ansible-playbook playbooks/bootstrap.yaml -u $(SSH_USER)
# Refuse to run unless `custodian_agent_pubkey` in group_vars/all.yaml is
# non-empty, then apply the custodian-agent playbook to all hosts.
# `or {}` covers an empty YAML file (safe_load returns None), which previously
# crashed with an AttributeError. Requires PyYAML for the pre-flight check.
.PHONY: provision-custodian-agent
provision-custodian-agent: ## Deploy custodian agent SSH key to all managed hosts
	@python3 -c "import sys, yaml; d=yaml.safe_load(open('$(ANS_DIR)/inventory/group_vars/all.yaml')) or {}; sys.exit(0 if d.get('custodian_agent_pubkey','') else 1)" \
	  || (echo "ERROR: custodian_agent_pubkey is empty. Run: cd ~/the-custodian && make custodian-keygen"; exit 1)
	cd $(ANS_DIR) && ansible-playbook playbooks/custodian-agent.yaml -u $(SSH_USER)
# Same playbook as provision-custodian-agent, restricted to one inventory host
# via --limit. HOST is mandatory.
.PHONY: provision-custodian-agent-host
provision-custodian-agent-host: ## Deploy custodian agent key to one host: make provision-custodian-agent-host HOST=Railiance01
	@test -n "$(HOST)" || (echo "Usage: make provision-custodian-agent-host HOST=Railiance01"; exit 1)
	cd $(ANS_DIR) && ansible-playbook playbooks/custodian-agent.yaml -u $(SSH_USER) \
	  --limit "$(HOST)"
# ---- Orchestration ----
# Format, provision, then converge — strictly in that order. The steps run as
# sequential sub-makes: as plain prerequisites they could run concurrently under
# `make -j`, letting Ansible start before Terraform has created the hosts.
.PHONY: apply
apply: ## Provision via Terraform then converge via Ansible
	@$(MAKE) --no-print-directory tf-fmt
	@$(MAKE) --no-print-directory tf-apply
	@$(MAKE) --no-print-directory ansible-bootstrap
# Print the operator checklist for a full S1→S5 rollout. This target only
# echoes text; it executes none of the listed commands.
.PHONY: deploy-stack
deploy-stack: ## Print the full S1→S5 ordered deploy sequence (operator follows each step)
	@echo ""
	@echo "╔══════════════════════════════════════════════════════════════╗"
	@echo "║ Railiance Stack — Full Deploy Sequence ║"
	@echo "║ See docs/deploy-stack.md for full runbook ║"
	@echo "╚══════════════════════════════════════════════════════════════╝"
	@echo ""
	@echo "PRE-CONDITIONS"
	@echo " [ ] SSH key: ~/.ssh/id_ops"
	@echo " [ ] SOPS key: ~/.config/sops/age/keys.txt (or SOPS_AGE_KEY)"
	@echo " [ ] ops-bridge: bridge up state-hub-coulombcore k3s-api-coulombcore"
	@echo ""
	@echo "S1 — Infrastructure Substrate (this repo)"
	@echo " make tf-plan && make tf-apply # provision server (skip if exists)"
	@echo " ssh tegwick@92.205.130.254 'cd ~/railiance-infra/ansible && ansible-playbook playbooks/bootstrap.yaml -c local --become -l CoulombCore'"
	@echo " make verify"
	@echo ""
	@echo "S2 — Cluster Runtime (railiance-cluster)"
	@echo " ssh tegwick@92.205.130.254 'cd ~/railiance-cluster && make converge && make smoke'"
	@echo ""
	@echo "S3 — Platform Services (railiance-platform)"
	@echo " ssh tegwick@92.205.130.254 'cd ~/railiance-platform && make db-deploy && make valkey-deploy'"
	@echo ""
	@echo "S4 — Developer Enablement (no workplan yet — ArgoCD already at cluster)"
	@echo " (no steps required)"
	@echo ""
	@echo "S5 — Workloads (railiance-apps)"
	@echo " ssh tegwick@92.205.130.254 'cd ~/railiance-apps && make gitea-deploy'"
	@echo " ssh tegwick@92.205.130.254 'cd ~/railiance-apps && make state-hub-deploy' # T09"
	@echo ""
	@echo " Full runbook: docs/deploy-stack.md"
# ---- Utilities ----
# Environment self-check. git is mandatory (fail aborts with status 1); the
# other tools and repo files are advisory. Missing optional tools were
# previously skipped without any output because `cmd && ok` prints nothing when
# `cmd` fails; `miss` now reports them.
.PHONY: doctor
doctor: ## Check tools and basic repo setup
	@bash -ceu ' \
	ok(){ printf "✔ %s\n" "$$1"; }; fail(){ printf "❌ %s\n" "$$1"; exit 1; }; \
	miss(){ printf " %s not found on PATH\n" "$$1"; }; \
	command -v git >/dev/null && ok "git: $$(git --version)" || fail "git missing"; \
	command -v ansible >/dev/null && ok "ansible: $$(ansible --version | head -1)" || miss ansible; \
	command -v sops >/dev/null && ok "sops: $$(sops --version --check-for-updates)" || miss sops; \
	command -v age >/dev/null && ok "age: $$(age --version)" || miss age; \
	command -v terraform >/dev/null && ok "terraform: $$(terraform -version | head -1)" || miss terraform; \
	test -f keys/admin_ssh.pub && ok "keys/admin_ssh.pub present" || echo " add your SSH pubkey to keys/admin_ssh.pub"; \
	test -f inventory/group_vars/secrets.sops.yaml && ok "secrets.sops.yaml present" || echo " create inventory/group_vars/secrets.sops.yaml"; \
	grep -q "age1" .sops.yaml && ok ".sops.yaml has an age recipient" || echo " add your age public key to .sops.yaml"; \
	git config --get core.hooksPath >/dev/null && ok "git hooksPath: $$(git config --get core.hooksPath)" || echo " run: make hooks"; \
	'
# ---- Inventory convenience ----
# Register a new host through scripts/new_host.py. NAME is mandatory.
# $(USER) is normally inherited from the login environment, which defeats the
# Makefile's `USER ?= admin` default. host_user therefore falls back to
# "admin" when USER comes only from the environment, and honours it when it is
# given on the command line (make new-host USER=...) or set in a makefile.
.PHONY: new-host
new-host: host_user = $(if $(filter environment,$(origin USER)),admin,$(USER))
new-host: ## Add a new host quickly: make new-host NAME=core1 TYPE=cpx11 REGION=nbg1 ROLE=core
	@[ -n "$(NAME)" ] || (echo "Usage: make new-host NAME=... [TYPE=...] [REGION=...] [ROLE=...] [IMG=...] [USER=...]" && exit 1)
	@python3 scripts/new_host.py --name "$(NAME)" --type "$(TYPE)" --region "$(REGION)" --role "$(ROLE)" --image "$(IMG)" --user "$(host_user)"
	@echo "✔ Added host $(NAME) to inventory/servers.yaml"
# Replace `origin` with the Gitea URL built from GITEA/OWNER/REPO, force-rename
# the current branch to main (`branch -M`) and push it with upstream tracking.
# Removing a non-existent origin is deliberately ignored.
.PHONY: remote-set
remote-set: ## Set origin to your Gitea repo (GITEA/OWNER/REPO vars)
	git remote remove origin 2>/dev/null || true
	git remote add origin "https://$(GITEA)/$(OWNER)/$(REPO).git"
	git branch -M main
	git push -u origin main
	@echo "✔ Remote set to https://$(GITEA)/$(OWNER)/$(REPO).git"
# ==== Convergence (Ansible) ====
ANS_DIR := ansible
INV_SCRIPT := $(ANS_DIR)/inventory_from_yaml.py
PLAY := $(ANS_DIR)/playbooks/bootstrap.yaml
SSH_USER ?= admin
# Make the local age identity available to sops when running playbooks.
# A SOPS_AGE_KEY already exported by the caller is left untouched (the former
# unconditional `export SOPS_AGE_KEY := $(shell cat ...)` replaced it, with an
# empty value when keys.txt was absent). The key file is referenced by path via
# SOPS_AGE_KEY_FILE rather than read with $(shell), because $(shell) turns every
# newline into a space and so flattens a multi-line keys.txt onto one line.
ifeq ($(origin SOPS_AGE_KEY),undefined)
ifneq ($(wildcard $(HOME)/.config/sops/age/keys.txt),)
export SOPS_AGE_KEY_FILE ?= $(HOME)/.config/sops/age/keys.txt
endif
endif
# Print a cheat-sheet of the convergence targets. Text only; runs nothing.
.PHONY: ansible-help
ansible-help: ## Show common Ansible commands
	@echo "Convergence targets:"
	@echo " make ansible-inventory # show resolved inventory"
	@echo " make ansible-ping # ping all hosts"
	@echo " make converge # run baseline convergence on all hosts"
	@echo " make converge-host HOST=web-01# run on a single host"
	@echo " make converge-tags TAGS=base # run only tagged tasks"
	@echo " make converge-check # dry-run (check mode)"
	@echo " make converge-diff # show config diffs"
# Dump the resolved inventory, truncated to the first 200 lines.
# Because of the pipe, the recipe's exit status is that of `head`, not of
# ansible-inventory.
.PHONY: ansible-inventory
ansible-inventory: ## Print the dynamic inventory Ansible will use
	cd $(ANS_DIR) && ansible-inventory --list | head -200
# Run Ansible's `ping` module against every inventory host as $(SSH_USER).
.PHONY: ansible-ping
ansible-ping: ## Quick connectivity check (SSH + Python availability)
	cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m ping
# Ad-hoc security snapshot of all hosts: connectivity, UFW, fail2ban and two
# sshd_config settings. Each section is a separate recipe line, so the first
# failing Ansible call stops the report at that section.
.PHONY: status
status: ## Show live security state of all hosts (UFW, fail2ban, SSH hardening)
	@echo "=== Connectivity ==="
	cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m ping
	@echo "=== UFW ==="
	cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "ufw status" --become
	@echo "=== fail2ban ==="
	cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "systemctl is-active fail2ban"
	@echo "=== SSH hardening ==="
	cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "grep -iE '^(PermitRootLogin|PasswordAuthentication)' /etc/ssh/sshd_config" --become
	@echo ""
	@echo "--- Hint: run 'make verify' for a structured pass/fail report ---"
# Run the verify playbook; on success, commit whatever it left under reports/.
# A failing playbook aborts the target before the commit step.
# The commit is scoped to reports/ so unrelated staged changes are never
# included. `git add` is its own step: in the former `a && b && c || commit`
# chain, a failing `git add` fell through to `git commit`.
.PHONY: verify
verify: ## Run Goss test suite against all hosts, commit TAP reports — exits non-zero on failure
	@echo "Running Goss baseline assertions..."
	@cd $(ANS_DIR) && ansible-playbook playbooks/verify.yaml -u $(SSH_USER) || \
	  (echo "One or more assertions FAILED — see reports/ for TAP output." && exit 1)
	@echo "All assertions passed."
	@git add reports/
	@if git diff --cached --quiet -- reports/; then \
	  echo "No new reports to commit."; \
	else \
	  git commit -m "chore: Goss verification reports $$(date -u +%Y-%m-%dT%H%M%SZ)" -- reports/; \
	fi
# Apply the baseline playbook to every host.
# $(PLAY) is defined with a leading $(ANS_DIR)/, but the recipe has already
# changed into $(ANS_DIR), so the prefix is stripped to avoid resolving to
# $(ANS_DIR)/$(ANS_DIR)/... A PLAY value without that prefix passes unchanged.
.PHONY: converge
converge: ## Converge all hosts to the baseline (idempotent)
	cd $(ANS_DIR) && ansible-playbook $(patsubst $(ANS_DIR)/%,%,$(PLAY)) -u $(SSH_USER)
# Apply the baseline playbook to a single inventory host (-l). HOST is
# mandatory and quoted. The $(ANS_DIR)/ prefix is stripped from $(PLAY) because
# the recipe has already changed into $(ANS_DIR).
.PHONY: converge-host
converge-host: ## Converge a single host: make converge-host HOST=core-01
	@test -n "$(HOST)" || (echo "Usage: make converge-host HOST=<name>"; exit 1)
	cd $(ANS_DIR) && ansible-playbook $(patsubst $(ANS_DIR)/%,%,$(PLAY)) -u $(SSH_USER) -l "$(HOST)"
# Apply only the tasks matching TAGS (comma-separated). TAGS is mandatory.
# The $(ANS_DIR)/ prefix is stripped from $(PLAY) because the recipe has
# already changed into $(ANS_DIR).
.PHONY: converge-tags
converge-tags: ## Run only certain tags: make converge-tags TAGS="base,ufw"
	@test -n "$(TAGS)" || (echo "Usage: make converge-tags TAGS=tag1,tag2"; exit 1)
	cd $(ANS_DIR) && ansible-playbook $(patsubst $(ANS_DIR)/%,%,$(PLAY)) -u $(SSH_USER) --tags "$(TAGS)"
# Run the baseline playbook with --check. The $(ANS_DIR)/ prefix is stripped
# from $(PLAY) because the recipe has already changed into $(ANS_DIR).
.PHONY: converge-check
converge-check: ## Dry-run (no changes), great for previews
	cd $(ANS_DIR) && ansible-playbook $(patsubst $(ANS_DIR)/%,%,$(PLAY)) -u $(SSH_USER) --check
# Run the baseline playbook with --diff. Note that without --check this
# applies changes. The $(ANS_DIR)/ prefix is stripped from $(PLAY) because the
# recipe has already changed into $(ANS_DIR).
.PHONY: converge-diff
converge-diff: ## Show file/templating diffs while applying changes
	cd $(ANS_DIR) && ansible-playbook $(patsubst $(ANS_DIR)/%,%,$(PLAY)) -u $(SSH_USER) --diff