proxMon/SETUP-AND-DEPLOY-slides.html
Carsten 2f65bab7cf docs: single-file HTML slide deck for setup & deployment
32-slide self-contained deck mirroring SETUP-AND-DEPLOY.md structure.
Keyboard nav (arrows/space/PageUp-Down/digits/f for fullscreen), swipe,
click-to-advance, deep-linkable slides via #s=N, print-friendly.
Zero external deps — ships as one HTML file.
2026-04-22 09:06:32 +02:00

1028 lines
35 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Proxmox Monitor — Setup &amp; Deploy</title>
<style>
:root {
--bg: #0b1220;
--bg-2: #121b2f;
--fg: #e6edf3;
--muted: #94a3b8;
--accent: #22c55e;
--accent-2: #38bdf8;
--warn: #f59e0b;
--danger: #ef4444;
--border: #1e293b;
--code-bg: #0f172a;
--code-fg: #cbd5e1;
}
* { box-sizing: border-box; }
html, body { margin: 0; padding: 0; height: 100%; width: 100%; }
body {
background: var(--bg);
color: var(--fg);
font-family: -apple-system, BlinkMacSystemFont, "Inter", "Segoe UI", Roboto, sans-serif;
font-size: 18px;
line-height: 1.5;
overflow: hidden;
-webkit-font-smoothing: antialiased;
}
.deck {
position: relative;
width: 100vw;
height: 100vh;
overflow: hidden;
}
.slide {
position: absolute;
inset: 0;
display: none;
padding: 4vw 6vw 6vw 6vw;
overflow: auto;
}
.slide.active { display: flex; flex-direction: column; }
.slide h1 {
font-size: clamp(1.6rem, 3.2vw, 2.8rem);
font-weight: 700;
letter-spacing: -0.02em;
margin: 0 0 1rem 0;
line-height: 1.15;
}
.slide h2 {
font-size: clamp(1.1rem, 1.6vw, 1.4rem);
font-weight: 500;
color: var(--muted);
margin: 0 0 2rem 0;
letter-spacing: 0.02em;
}
.slide h3 {
font-size: 1.15rem;
font-weight: 600;
margin: 1.2rem 0 0.5rem 0;
color: var(--accent-2);
}
.eyebrow {
display: inline-block;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.16em;
text-transform: uppercase;
color: var(--accent);
margin-bottom: 0.6rem;
border-left: 3px solid var(--accent);
padding-left: 0.7rem;
}
.slide ul, .slide ol {
padding-left: 1.3rem;
margin: 0.5rem 0 1rem 0;
}
.slide li { margin: 0.35rem 0; }
.slide .muted { color: var(--muted); }
.slide strong { color: #ffffff; }
.slide code {
font-family: "SF Mono", Menlo, Consolas, "Liberation Mono", monospace;
background: var(--code-bg);
padding: 0.1rem 0.4rem;
border-radius: 4px;
font-size: 0.92em;
color: var(--code-fg);
}
/* Block-level code samples: bordered panel that scrolls sideways instead of wrapping. */
.slide pre {
  background: var(--code-bg);
  border: 1px solid var(--border);
  border-radius: 8px;
  padding: 1rem 1.2rem;
  overflow-x: auto;
  font-family: "SF Mono", Menlo, Consolas, "Liberation Mono", monospace;
  font-size: clamp(0.72rem, 0.95vw, 0.95rem);
  line-height: 1.5;
  color: var(--code-fg);
  margin: 0.6rem 0 1rem 0;
}
/* The inline-chip rule `.slide code` also matches the <code> nested inside each <pre>.
   Horizontal padding on a multi-line inline box is applied only at its first and last
   fragment, which pushes the first line of every sample out of alignment with the
   lines below it. Drop the chip decoration here; the <pre> already draws the panel. */
.slide pre code {
  background: none;
  padding: 0;
  border-radius: 0;
}
.slide pre .c { color: #64748b; }
.slide pre .k { color: var(--accent-2); }
.slide pre .s { color: #fca5a5; }
.slide pre .n { color: var(--warn); }
.grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1.5rem;
}
.grid-3 {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 1rem;
}
.card {
background: var(--bg-2);
border: 1px solid var(--border);
border-radius: 10px;
padding: 1.2rem 1.3rem;
}
.card .label {
font-size: 0.72rem;
text-transform: uppercase;
letter-spacing: 0.12em;
color: var(--muted);
margin-bottom: 0.4rem;
}
.card .value { font-weight: 600; color: #ffffff; }
.callout {
border-left: 4px solid var(--warn);
background: rgba(245, 158, 11, 0.08);
padding: 0.8rem 1.1rem;
border-radius: 4px;
margin: 0.8rem 0;
color: #fde68a;
font-size: 0.95rem;
}
.callout.danger {
border-color: var(--danger);
background: rgba(239, 68, 68, 0.08);
color: #fecaca;
}
.callout.info {
border-color: var(--accent-2);
background: rgba(56, 189, 248, 0.08);
color: #bae6fd;
}
.check {
display: flex;
align-items: flex-start;
gap: 0.8rem;
padding: 0.45rem 0;
border-bottom: 1px solid var(--border);
font-size: 0.95rem;
}
.check:last-child { border-bottom: none; }
.check .box {
flex: 0 0 auto;
width: 18px;
height: 18px;
border: 2px solid var(--accent);
border-radius: 3px;
margin-top: 3px;
}
table { width: 100%; border-collapse: collapse; margin: 0.6rem 0; font-size: 0.9rem; }
th, td {
text-align: left;
padding: 0.45rem 0.6rem;
border-bottom: 1px solid var(--border);
vertical-align: top;
}
th { color: var(--muted); font-weight: 600; font-size: 0.78rem; letter-spacing: 0.08em; text-transform: uppercase; }
td code { font-size: 0.85em; }
.ascii {
font-family: "SF Mono", Menlo, Consolas, monospace;
font-size: clamp(0.72rem, 0.95vw, 1rem);
color: var(--code-fg);
white-space: pre;
background: var(--code-bg);
border: 1px solid var(--border);
border-radius: 8px;
padding: 1rem 1.2rem;
overflow-x: auto;
}
/* Title slide */
.title-slide {
justify-content: center;
align-items: flex-start;
}
.title-slide h1 {
font-size: clamp(2.5rem, 6vw, 5rem);
margin-bottom: 0.5rem;
}
.title-slide .subtitle {
font-size: clamp(1.2rem, 2vw, 1.6rem);
color: var(--muted);
margin-bottom: 1.5rem;
}
.title-slide .meta {
margin-top: 2rem;
color: var(--muted);
font-size: 0.9rem;
}
/* End slide */
.end-slide { justify-content: center; align-items: center; text-align: center; }
.end-slide h1 { font-size: clamp(2rem, 5vw, 4rem); }
/* Chrome */
.topbar {
position: fixed;
top: 0; left: 0; right: 0;
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.9rem 1.4rem;
font-size: 0.78rem;
color: var(--muted);
z-index: 10;
pointer-events: none;
}
.topbar .brand { letter-spacing: 0.12em; text-transform: uppercase; }
.topbar .counter { font-variant-numeric: tabular-nums; }
.progress {
position: fixed;
bottom: 0; left: 0;
height: 2px;
background: var(--accent);
transition: width 0.25s ease;
z-index: 10;
}
.footer-hint {
position: fixed;
bottom: 0.7rem; right: 1.4rem;
font-size: 0.72rem;
color: var(--muted);
z-index: 10;
pointer-events: none;
}
@media (max-width: 720px) {
.grid, .grid-3 { grid-template-columns: 1fr; }
.slide { padding: 5vw 5vw 10vw 5vw; }
}
/* Print: one slide per page, no navigation chrome. */
@media print {
  /* Undo the screen-only viewport lock (height: 100% + overflow: hidden on html/body);
     left in place it can clip the printout to the first page. */
  html, body { height: auto; overflow: visible; }
  .topbar, .progress, .footer-hint { display: none; }
  .deck, .slide { position: static; height: auto; overflow: visible; }
  /* !important beats the display: none that hides non-active slides on screen. */
  .slide {
    display: block !important;
    page-break-after: always; /* legacy property, kept for older engines */
    break-after: page;
    min-height: 100vh;
  }
}
</style>
</head>
<body>
<div class="topbar">
<span class="brand">Proxmox Monitor · Setup &amp; Deploy</span>
<span class="counter"><span id="current">1</span> / <span id="total">1</span></span>
</div>
<div class="progress" id="progress" style="width:0%"></div>
<div class="footer-hint">← → or space to navigate · <kbd>f</kbd> fullscreen</div>
<div class="deck" id="deck">
<!-- 1 -->
<section class="slide title-slide">
<span class="eyebrow">Runbook</span>
<h1>Proxmox Monitor</h1>
<div class="subtitle">Setup &amp; Deployment — Production Rollout</div>
<p class="muted" style="max-width:48rem">
Agent-server monitoring for Proxmox hosts. Elixir/OTP backend, Burrito-packaged agents,
Phoenix LiveView dashboard. This deck walks you from a clean environment to 20 hosts reporting,
in order, with verification at every step.
</p>
<div class="meta">
Reference: <code>SETUP-AND-DEPLOY.md</code> · ~2–3h end-to-end + host rollout time
</div>
</section>
<!-- 2 -->
<section class="slide">
<span class="eyebrow">What you're deploying</span>
<h1>Architecture</h1>
<h2>Two artifacts, independent pipelines, one dashboard</h2>
<div class="ascii">
┌─────────────────────────┐
│ Server (LXC in RZ) │
agents ──WSS──│ · Phoenix release │
│ · SQLite │
│ · Caddy (TLS) │
└─────────────────────────┘
│ ssh
┌─────────────────────────┐
│ Operator workstation │
│ · builds server │
│ · builds agent binary │
└─────────────────────────┘
│ scp
┌─────────────────────────┐
│ Proxmox host (1 of N) │
│ · Burrito binary │
│ · systemd unit │
└─────────────────────────┘
</div>
<p class="muted">Agents initiate outbound WSS — no inbound ports on Proxmox hosts.</p>
</section>
<!-- 3 -->
<section class="slide">
<span class="eyebrow">Phases</span>
<h1>Roadmap for this deck</h1>
<ol>
<li><strong>Preflight</strong> — confirm prerequisites</li>
<li><strong>Local build</strong> — produce the two artifacts</li>
<li><strong>Server deploy</strong> — one-time LXC bring-up</li>
<li><strong>First agent</strong> — prove the pipeline end-to-end</li>
<li><strong>Test tier</strong> — 2–3 hosts for 24h</li>
<li><strong>Full rollout</strong> — the remaining fleet</li>
<li><strong>Rollback</strong> — because things go wrong</li>
<li><strong>Ongoing operations</strong> — upgrades, backups, rotation</li>
<li><strong>Go / No-Go</strong> — final sign-off</li>
</ol>
</section>
<!-- 4 -->
<section class="slide">
<span class="eyebrow">§ 1 Preflight</span>
<h1>Hardware &amp; network</h1>
<div class="grid">
<div class="card">
<div class="label">Server LXC</div>
<div class="value">Debian 12 · 1 GB RAM · 2 cores · 10 GB</div>
<p class="muted" style="margin:.5rem 0 0 0">Unprivileged. Covers &gt;20 agents comfortably.</p>
</div>
<div class="card">
<div class="label">DNS</div>
<div class="value">A record → public IP</div>
<p class="muted" style="margin:.5rem 0 0 0">Verify: <code>dig +short monitor.example.com</code></p>
</div>
<div class="card">
<div class="label">Inbound</div>
<div class="value">TCP 443 → server LXC</div>
<p class="muted" style="margin:.5rem 0 0 0">Caddy handles Let's Encrypt via HTTP-01.</p>
</div>
<div class="card">
<div class="label">Outbound</div>
<div class="value">HTTPS from every Proxmox host</div>
<p class="muted" style="margin:.5rem 0 0 0">No inbound port required on hosts.</p>
</div>
</div>
<p class="muted" style="margin-top:1rem">SSH root access: hypervisor + every Proxmox host.</p>
</section>
<!-- 5 -->
<section class="slide">
<span class="eyebrow">§ 1 Preflight</span>
<h1>Versions &amp; tools</h1>
<div class="grid">
<div>
<h3>Proxmox fleet</h3>
<ul>
<li>VE <strong>8.3+</strong></li>
<li>OpenZFS <strong>2.3+</strong> (for <code>-j</code> JSON output)</li>
<li>Older hosts will report empty ZFS payloads</li>
</ul>
</div>
<div>
<h3>Build machine</h3>
<ul>
<li>Elixir <strong>1.19</strong> + OTP <strong>28</strong></li>
<li>Mix + Hex</li>
<li><strong>Docker</strong> daemon running (for Linux binaries)</li>
<li>SSH, scp, <code>sqlite3</code> (optional)</li>
</ul>
</div>
</div>
<div class="callout info">
No Docker? Run <code>./scripts/build-linux.sh</code> on the server LXC itself instead.
</div>
</section>
<!-- 6 -->
<section class="slide">
<span class="eyebrow">§ 1 Preflight</span>
<h1>Secrets plan</h1>
<h2>Three values — keep in a password manager, never in git</h2>
<table>
<thead><tr><th>Secret</th><th>How to generate</th></tr></thead>
<tbody>
<tr>
<td><code>DASHBOARD_PASSWORD_HASH</code></td>
<td><code>mix run -e 'IO.puts(Argon2.hash_pwd_salt("&lt;pw&gt;"))'</code></td>
</tr>
<tr>
<td><code>SECRET_KEY_BASE</code></td>
<td><code>mix phx.gen.secret</code> (64-byte base64)</td>
</tr>
<tr>
<td>Per-agent tokens</td>
<td>Admin UI → <em>Add host</em> reveals token once</td>
</tr>
</tbody>
</table>
<div class="callout danger">
Tokens are shown <strong>once</strong>. Paste into your password manager before clicking away.
</div>
</section>
<!-- 7 -->
<section class="slide">
<span class="eyebrow">§ 2 Local build</span>
<h1>Tests first</h1>
<h2>If either suite is red, stop</h2>
<pre><code>cd server &amp;&amp; mix deps.get &amp;&amp; mix test
cd ../agent &amp;&amp; mix deps.get &amp;&amp; mix test</code></pre>
<div class="grid">
<div class="card">
<div class="label">Server</div>
<div class="value">58 tests, 0 failures</div>
</div>
<div class="card">
<div class="label">Agent</div>
<div class="value">23 tests, 0 failures</div>
</div>
</div>
<p class="muted">Never build a release from a branch with failing tests.</p>
</section>
<!-- 8 -->
<section class="slide">
<span class="eyebrow">§ 2 Local build</span>
<h1>Hash the password</h1>
<pre><code>cd server
mix run -e 'IO.puts(Argon2.hash_pwd_salt("your-password"))'</code></pre>
<p>Output looks like:</p>
<pre><code>$argon2id$v=19$m=65536,t=3,p=4$dSB9...$x0OQ...</code></pre>
<div class="callout">
Copy the whole <code>$argon2id$...</code> string into your password manager.
The plaintext password never leaves your head / password manager.
</div>
</section>
<!-- 9 -->
<section class="slide">
<span class="eyebrow">§ 2 Local build</span>
<h1>Server release</h1>
<pre><code>MIX_ENV=prod DASHBOARD_PASSWORD_HASH='placeholder' \
mix release --overwrite
tar -czf /tmp/server_release.tgz -C _build/prod/rel server
ls -lh /tmp/server_release.tgz</code></pre>
<p>Expected: ~30–60 MB tarball.</p>
<div class="callout info">
The <code>placeholder</code> hash only needs to exist so <code>config/runtime.exs</code>
accepts it. The real hash is supplied on the LXC at start time.
</div>
</section>
<!-- 10 -->
<section class="slide">
<span class="eyebrow">§ 2 Local build</span>
<h1>Agent binaries</h1>
<pre><code>cd ../agent
./scripts/build-linux.sh</code></pre>
<p>Expected output:</p>
<pre><code>Binaries written to /.../agent/dist:
proxmox-monitor-agent_linux_amd64
proxmox-monitor-agent_linux_arm64</code></pre>
<p>Sanity check:</p>
<pre><code>file dist/proxmox-monitor-agent_linux_amd64 | grep 'ELF 64-bit'</code></pre>
<p class="muted">First build: 5–10 min. Subsequent builds: seconds (Docker layer cache).</p>
</section>
<!-- 11 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Create the LXC</h1>
<p>On the hypervisor:</p>
<pre><code>pct create 200 \
/var/lib/vz/template/cache/debian-12-standard_12.7-1_amd64.tar.zst \
--hostname proxmox-monitor \
--memory 1024 --cores 2 \
--rootfs local-zfs:10 \
--net0 name=eth0,bridge=vmbr0,ip=dhcp \
--unprivileged 1 --features nesting=0 --onboot 1
pct start 200
pct exec 200 -- ip -4 addr show eth0 | grep -Po 'inet \K[\d.]+'</code></pre>
<p>Save the IP as <code>LXC_IP</code>. Typos here cost hours.</p>
</section>
<!-- 12 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Base packages</h1>
<p><code>pct enter 200</code> then:</p>
<pre><code>apt-get update
apt-get install -y ca-certificates curl gnupg \
debian-keyring debian-archive-keyring apt-transport-https sqlite3
curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | \
gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg
curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' \
> /etc/apt/sources.list.d/caddy-stable.list
apt-get update &amp;&amp; apt-get install -y caddy
caddy version</code></pre>
</section>
<!-- 13 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Upload and extract the release</h1>
<p>From your workstation:</p>
<pre><code>scp /tmp/server_release.tgz root@$LXC_IP:/tmp/</code></pre>
<p>Inside the LXC:</p>
<pre><code>mkdir -p /opt/proxmox-monitor
tar -xzf /tmp/server_release.tgz -C /opt/proxmox-monitor
ls /opt/proxmox-monitor/server/bin/
# server migrate server.bat migrate.bat</code></pre>
</section>
<!-- 14 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Environment file</h1>
<pre><code>install -d -m 0700 /var/lib/proxmox-monitor
cat &gt; /etc/default/proxmox-monitor &lt;&lt;'EOF'
DATABASE_PATH=/var/lib/proxmox-monitor/monitor.db
SECRET_KEY_BASE=&lt;paste-mix-phx.gen.secret-output&gt;
DASHBOARD_PASSWORD_HASH=&lt;paste-$argon2id$-hash&gt;
PHX_SERVER=true
PHX_HOST=monitor.example.com
PORT=4000
EOF
chmod 0600 /etc/default/proxmox-monitor</code></pre>
<div class="callout danger">
<strong>Single-quoted heredoc matters.</strong> A double-quoted one eats the <code>$</code>
characters in the Argon2 hash.
</div>
</section>
<!-- 15 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Migrate &amp; systemd</h1>
<pre><code>set -a; . /etc/default/proxmox-monitor; set +a
/opt/proxmox-monitor/server/bin/server eval 'Server.Release.migrate()'
sqlite3 /var/lib/proxmox-monitor/monitor.db '.tables'
# hosts metrics schema_migrations</code></pre>
<p>Then install the systemd unit (see runbook §3.6) with:</p>
<pre><code>ExecStartPre=/opt/proxmox-monitor/server/bin/server eval 'Server.Release.migrate()'
ExecStart=/opt/proxmox-monitor/server/bin/server start
Restart=always
RestartSec=5</code></pre>
<pre><code>systemctl daemon-reload &amp;&amp; systemctl enable --now proxmox-monitor</code></pre>
</section>
<!-- 16 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Caddy: TLS + WSS reverse-proxy</h1>
<pre><code>monitor.example.com {
reverse_proxy 127.0.0.1:4000 {
header_up X-Forwarded-Proto {scheme}
header_up X-Forwarded-For {remote_host}
transport http {
read_timeout 90s
dial_timeout 10s
}
}
}</code></pre>
<div class="callout danger">
<code>read_timeout 90s</code> is <strong>critical</strong>. Without it, every agent's
WebSocket is torn down every ~30s and the dashboard stays permanently offline-looking.
</div>
<pre><code>caddy validate --config /etc/caddy/Caddyfile
systemctl reload caddy</code></pre>
</section>
<!-- 17 -->
<section class="slide">
<span class="eyebrow">§ 3 Server deploy</span>
<h1>Server smoke test</h1>
<p>From anywhere on the internet:</p>
<pre><code>curl -s https://monitor.example.com/health</code></pre>
<p>Expected:</p>
<pre><code>{"db":"ok","status":"ok","version":"0.1.0"}</code></pre>
<p>Then browser:</p>
<ul>
<li>Open <code>https://monitor.example.com/</code> → redirects to <code>/login</code></li>
<li>Enter your dashboard password → lands on empty overview</li>
<li>"No hosts registered yet." is the expected empty state</li>
</ul>
<div class="callout">
Login loops on "Incorrect password"? <code>DASHBOARD_PASSWORD_HASH</code> was not pasted
correctly. Re-generate and redeploy §3.4.
</div>
</section>
<!-- 18 -->
<section class="slide">
<span class="eyebrow">§ 4 First agent</span>
<h1>Register in the admin UI</h1>
<ol>
<li>Browser → <code>/admin/hosts</code></li>
<li>"Register a new host" → enter <strong>short name</strong> (e.g. <code>pve-host-01</code>)</li>
<li>Click <strong>Add</strong></li>
<li>The page reveals a token — <strong>copy it now</strong></li>
</ol>
<div class="callout danger">
Tokens are shown <strong>exactly once</strong>. If you close the page without copying,
rotate the token and try again.
</div>
</section>
<!-- 19 -->
<section class="slide">
<span class="eyebrow">§ 4 First agent</span>
<h1>Deploy binary + config</h1>
<pre><code>export HOST=pve-host-01
scp agent/dist/proxmox-monitor-agent_linux_amd64 \
root@$HOST:/usr/local/bin/proxmox-monitor-agent
ssh root@$HOST 'chmod 0755 /usr/local/bin/proxmox-monitor-agent'
scp agent/rel/proxmox-monitor-agent.service \
root@$HOST:/etc/systemd/system/</code></pre>
<p>On the host — write the TOML config:</p>
<pre><code>install -d -m 0700 /etc/proxmox-monitor /var/cache/proxmox-monitor-agent
cat &gt; /etc/proxmox-monitor/agent.toml &lt;&lt;'EOF'
server_url = "wss://monitor.example.com/socket/websocket"
token = "&lt;paste-token-from-dashboard&gt;"
host_id = "pve-host-01"
[intervals]
fast_seconds = 30
medium_seconds = 300
slow_seconds = 1800
EOF
chmod 0600 /etc/proxmox-monitor/agent.toml</code></pre>
</section>
<!-- 20 -->
<section class="slide">
<span class="eyebrow">§ 4 First agent</span>
<h1>Enable and verify</h1>
<pre><code>systemctl daemon-reload
systemctl enable --now proxmox-monitor-agent
journalctl -u proxmox-monitor-agent -f</code></pre>
<p>Expected within 10s:</p>
<pre><code>agent: starting with host_id=pve-host-01
reporter: connected, joining host:pve-host-01
reporter: joined host:pve-host-01</code></pre>
<p>Reload the dashboard — the card should be <strong>online</strong> (green border)
with Load / RAM / Pools / VMs populated.</p>
</section>
<!-- 21 -->
<section class="slide">
<span class="eyebrow">§ 4 First agent</span>
<h1>Verify the offline flip</h1>
<div class="grid">
<div>
<h3>Test</h3>
<pre><code>ssh root@$HOST \
'systemctl stop proxmox-monitor-agent'</code></pre>
<p>Dashboard card grey within ~1s.</p>
<pre><code>ssh root@$HOST \
'systemctl start proxmox-monitor-agent'</code></pre>
<p>Green again within 30s.</p>
</div>
<div>
<h3>If the card stays green</h3>
<p>Channel <code>terminate</code> callback didn't run — usually Caddy.</p>
<ul>
<li>Check <code>/etc/caddy/Caddyfile</code> has <code>read_timeout 90s</code></li>
<li><code>systemctl reload caddy</code> after fixing</li>
</ul>
</div>
</div>
</section>
<!-- 22 -->
<section class="slide">
<span class="eyebrow">§ 5 Test tier</span>
<h1>2–3 hosts for 24h</h1>
<p>Pick non-critical hosts, or hosts with independent monitoring to fall back on.</p>
<p>What to look for overnight:</p>
<ul>
<li>All cards remain <strong>online</strong></li>
<li>No repeating <code>[error]</code> lines in server log</li>
<li>Retention log line appears: <code>retention: pruned N stale samples</code> (starts firing after 48h)</li>
</ul>
<h3>Tests to actively run</h3>
<ul>
<li>Reboot a Proxmox host → card goes offline, returns without intervention</li>
<li><code>systemctl restart proxmox-monitor</code> on the server → all agents flip offline, then green within 30s. No stuck agents.</li>
</ul>
</section>
<!-- 23 -->
<section class="slide">
<span class="eyebrow">§ 5 Test tier</span>
<h1>Go / No-Go gate</h1>
<h2>Do NOT proceed to full rollout unless ALL are true for 24h</h2>
<div class="check"><div class="box"></div>All test-tier hosts show <strong>online</strong> continuously</div>
<div class="check"><div class="box"></div>No repeating error lines in server logs</div>
<div class="check"><div class="box"></div>Retention has pruned at least one row</div>
<div class="check"><div class="box"></div>Token rotation + restart behaves as designed</div>
<div class="check"><div class="box"></div>Server-reboot drill: all agents recover without intervention</div>
<div class="check"><div class="box"></div>Dashboard is responsive (&lt;1s LiveView updates)</div>
</section>
<!-- 24 -->
<section class="slide">
<span class="eyebrow">§ 6 Full rollout</span>
<h1>Batch loop</h1>
<p>After 3–4 hosts by hand, batch:</p>
<pre><code>for HOST in pve-host-04 pve-host-05 pve-host-06; do
echo "Register $HOST in admin UI, paste token:"
read -s TOKEN
scp agent/dist/proxmox-monitor-agent_linux_amd64 \
root@$HOST:/usr/local/bin/proxmox-monitor-agent
scp agent/rel/proxmox-monitor-agent.service \
root@$HOST:/etc/systemd/system/
ssh root@$HOST "chmod 0755 /usr/local/bin/proxmox-monitor-agent &amp;&amp;
install -d -m 0700 /etc/proxmox-monitor /var/cache/proxmox-monitor-agent &amp;&amp;
cat &gt; /etc/proxmox-monitor/agent.toml &lt;&lt;EOF
server_url = \"wss://monitor.example.com/socket/websocket\"
token = \"$TOKEN\"
host_id = \"$HOST\"
EOF
chmod 0600 /etc/proxmox-monitor/agent.toml &amp;&amp;
systemctl daemon-reload &amp;&amp;
systemctl enable --now proxmox-monitor-agent"
done</code></pre>
<p class="muted">After each batch of ~5: spot-check cards, filter for offline, open a random host detail.</p>
</section>
<!-- 25 -->
<section class="slide">
<span class="eyebrow">§ 7 Rollback</span>
<h1>Four escape hatches</h1>
<div class="grid">
<div class="card">
<div class="label">One agent</div>
<pre style="margin-top:.5rem"><code>ssh root@$HOST \
'systemctl disable --now proxmox-monitor-agent'</code></pre>
</div>
<div class="card">
<div class="label">Whole service</div>
<pre style="margin-top:.5rem"><code>systemctl stop proxmox-monitor
systemctl stop caddy</code></pre>
</div>
<div class="card">
<div class="label">Previous release</div>
<pre style="margin-top:.5rem"><code>systemctl stop proxmox-monitor
rm -rf /opt/proxmox-monitor/server
tar -xzf /tmp/server_release_PREV.tgz \
-C /opt/proxmox-monitor
systemctl start proxmox-monitor</code></pre>
</div>
<div class="card">
<div class="label">Restore DB</div>
<pre style="margin-top:.5rem"><code>systemctl stop proxmox-monitor
cp /var/backups/proxmox-monitor/monitor-YYYY-MM-DD.db \
/var/lib/proxmox-monitor/monitor.db
systemctl start proxmox-monitor</code></pre>
</div>
</div>
<p class="muted">Tokens survive DB restores. Metrics post-backup are lost (48h max by retention policy).</p>
</section>
<!-- 26 -->
<section class="slide">
<span class="eyebrow">§ 8 Ongoing ops</span>
<h1>Upgrades</h1>
<div class="grid">
<div>
<h3>Server</h3>
<pre><code>cd server
MIX_ENV=prod DASHBOARD_PASSWORD_HASH='placeholder' \
mix release --overwrite
tar -czf /tmp/server_release.tgz -C _build/prod/rel server
scp /tmp/server_release.tgz root@$LXC:/tmp/
ssh root@$LXC '
systemctl stop proxmox-monitor
mv /opt/proxmox-monitor/server{,.old}
tar -xzf /tmp/server_release.tgz -C /opt/proxmox-monitor
systemctl start proxmox-monitor # ExecStartPre runs migrate
'</code></pre>
<p class="muted">Verify <code>/health</code> then delete <code>server.old</code>.</p>
</div>
<div>
<h3>Agent</h3>
<pre><code>scp agent/dist/proxmox-monitor-agent_linux_amd64 \
root@$HOST:/usr/local/bin/proxmox-monitor-agent.new
ssh root@$HOST '
mv /usr/local/bin/proxmox-monitor-agent{.new,}
systemctl restart proxmox-monitor-agent
'</code></pre>
<p class="muted">No DB on the host, so agent upgrades are trivially atomic.</p>
</div>
</div>
</section>
<!-- 27 -->
<section class="slide">
<span class="eyebrow">§ 8 Ongoing ops</span>
<h1>SQLite backups</h1>
<p>Install as a cron inside the LXC — keeps 30 daily snapshots:</p>
<pre><code>cat &gt; /etc/cron.d/proxmox-monitor-backup &lt;&lt;'EOF'
30 3 * * * root install -d -m 0700 /var/backups/proxmox-monitor &amp;&amp; \
sqlite3 /var/lib/proxmox-monitor/monitor.db \
".backup /var/backups/proxmox-monitor/monitor-$(date +\%Y-\%m-\%d).db" &amp;&amp; \
find /var/backups/proxmox-monitor -name 'monitor-*.db' -mtime +30 -delete
EOF</code></pre>
<p>SQLite's online-backup command is safe while the server is running.</p>
<p class="muted">Verify at least one run before declaring the rollout complete.</p>
</section>
<!-- 28 -->
<section class="slide">
<span class="eyebrow">§ 9 Sign-off</span>
<h1>Production readiness</h1>
<div class="check"><div class="box"></div><code>/health</code> returns 200 with <code>status:ok</code></div>
<div class="check"><div class="box"></div>External uptime monitor configured and green</div>
<div class="check"><div class="box"></div>All intended Proxmox hosts on overview, all <strong>online</strong></div>
<div class="check"><div class="box"></div>≥1 full 48h retention cycle observed (pruning log present)</div>
<div class="check"><div class="box"></div>SQLite backup cron installed and yesterday's file exists</div>
<div class="check"><div class="box"></div>You have rolled back once <strong>on purpose</strong> (drill)</div>
</section>
<!-- 29 -->
<section class="slide">
<span class="eyebrow">§ 9 Sign-off</span>
<h1>Secrets hygiene</h1>
<div class="check"><div class="box"></div>Dashboard password in a password manager, not a text file</div>
<div class="check"><div class="box"></div><code>SECRET_KEY_BASE</code> in a password manager</div>
<div class="check"><div class="box"></div><code>/etc/default/proxmox-monitor</code> is <code>0600 root:root</code></div>
<div class="check"><div class="box"></div><code>/etc/proxmox-monitor/agent.toml</code> is <code>0600 root:root</code> on every host</div>
<div class="check"><div class="box"></div>You can rotate an agent token in &lt;2 minutes</div>
<div class="check"><div class="box"></div>A teammate has been walked through one agent install and one token rotation live</div>
</section>
<!-- 30 -->
<section class="slide">
<span class="eyebrow">Appendix A</span>
<h1>Common errors</h1>
<table>
<thead><tr><th>Symptom</th><th>First thing to check</th></tr></thead>
<tbody>
<tr><td><code>CERT_AUTHORITY_INVALID</code> in browser</td><td>Caddy hasn't finished LE issuance. Wait 60s. <code>journalctl -u caddy</code>.</td></tr>
<tr><td>Login loops on correct password</td><td><code>DASHBOARD_PASSWORD_HASH</code> mismatch. Regenerate and redeploy.</td></tr>
<tr><td>Card stays offline after agent restart</td><td>Wrong token or <code>unknown_host</code>. Check agent journal.</td></tr>
<tr><td>All agents reconnect every ~30s</td><td>Caddy <code>read_timeout</code> missing or too short.</td></tr>
<tr><td><code>/health</code> returns 503</td><td>Process up but DB unreadable. Check permissions + <code>DATABASE_PATH</code>.</td></tr>
<tr><td>LXC can't bind port 4000</td><td>Another process owns it. <code>ss -ltnp | grep 4000</code>.</td></tr>
<tr><td>Agent logs <code>{:enoent, "pvesh"}</code></td><td>Not a Proxmox host, or empty <code>$PATH</code> under systemd.</td></tr>
</tbody>
</table>
</section>
<!-- 31 -->
<section class="slide">
<span class="eyebrow">Appendix B</span>
<h1>File &amp; port cheat sheet</h1>
<div class="grid">
<div>
<h3>Server LXC</h3>
<pre><code>/opt/proxmox-monitor/server/ release tree
/etc/default/proxmox-monitor env secrets, 0600
/etc/systemd/system/proxmox-monitor.service
/etc/caddy/Caddyfile
/var/lib/proxmox-monitor/monitor.db
/var/backups/proxmox-monitor/ daily backups
tcp 443 (caddy) → tcp 127.0.0.1:4000 (phoenix)</code></pre>
</div>
<div>
<h3>Proxmox host (per agent)</h3>
<pre><code>/usr/local/bin/proxmox-monitor-agent
/etc/proxmox-monitor/agent.toml token, 0600
/etc/systemd/system/proxmox-monitor-agent.service
/var/cache/proxmox-monitor-agent/ Burrito unpack
no listening ports</code></pre>
</div>
</div>
</section>
<!-- 32 -->
<section class="slide end-slide">
<span class="eyebrow">Done</span>
<h1>MVP in production</h1>
<p class="muted" style="max-width:40rem">
All four phases from the concept shipped: monitoring skeleton, ZFS/VM/storage collectors,
LiveView dashboard, packaged binaries. The operator has the runbook; agents report; retention
prunes; backups run. Everything else is iteration.
</p>
<p class="muted" style="margin-top:2rem;font-size:.85rem">
Full runbook: <code>SETUP-AND-DEPLOY.md</code> · Concept: <code>proxmox-monitor-konzept.md</code>
</p>
</section>
</div>
<script>
(function () {
// Deck state shared by every handler below: the slide elements in document order,
// their count (also written into the "/ N" counter), and the zero-based active index.
const slides = [...document.querySelectorAll('.slide')];
const total = slides.length;
document.getElementById('total').textContent = total;
let current = 0;
// Deep link: "#s=N" (1-based) selects the starting slide. The fragment is re-read as a
// query string so URLSearchParams can parse it; anything non-numeric or out of range
// leaves the deck on the first slide.
const startParam = new URLSearchParams(location.hash.replace(/^#/, '?')).get('s');
if (startParam !== null) {
  const requested = parseInt(startParam, 10);
  const inRange = !isNaN(requested) && requested >= 1 && requested <= total;
  if (inRange) current = requested - 1;
}
// Push the value of `current` into the page: mark exactly one slide as active, refresh
// the counter and progress bar, mirror the position into the URL fragment, and scroll
// the newly shown slide back to its top.
function render() {
  const position = current + 1;
  for (let i = 0; i < slides.length; i++) {
    slides[i].classList.toggle('active', i === current);
  }
  document.getElementById('current').textContent = position;
  const bar = document.getElementById('progress');
  bar.style.width = (position / total * 100) + '%';
  location.hash = 's=' + position;
  slides[current].scrollTop = 0;
}
// Move `delta` slides forward (positive) or back (negative), clamped to the first and
// last slide, then redraw.
function go(delta) {
  let target = current + delta;
  if (target > total - 1) target = total - 1;
  if (target < 0) target = 0;
  current = target;
  render();
}
// Keyboard navigation:
//   ArrowRight / PageDown / Space  -> next slide
//   ArrowLeft / PageUp             -> previous slide
//   Home / End                     -> first / last slide
//   f / F                          -> toggle fullscreen
//   1-9                            -> jump straight to that slide
document.addEventListener('keydown', (e) => {
  // Don't steal keystrokes from form fields.
  if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
  // Leave browser and OS shortcuts alone: without this guard Alt+ArrowLeft (history
  // back) is swallowed by preventDefault, Ctrl/Cmd+F toggles fullscreen, and
  // Ctrl/Cmd+digit (tab switching) jumps slides. Shift is allowed so "F" still works.
  if (e.ctrlKey || e.metaKey || e.altKey) return;
  switch (e.key) {
    case 'ArrowRight':
    case 'PageDown':
    case ' ':
      e.preventDefault(); go(1); break;
    case 'ArrowLeft':
    case 'PageUp':
      e.preventDefault(); go(-1); break;
    case 'Home':
      e.preventDefault(); current = 0; render(); break;
    case 'End':
      e.preventDefault(); current = total - 1; render(); break;
    case 'f':
    case 'F':
      // Optional calls: the Fullscreen API may be missing in some browsers.
      if (!document.fullscreenElement) {
        document.documentElement.requestFullscreen?.();
      } else {
        document.exitFullscreen?.();
      }
      break;
    default:
      // Single digit: direct jump, limited to slides that exist.
      if (/^[0-9]$/.test(e.key)) {
        const n = parseInt(e.key, 10);
        if (n >= 1 && n <= Math.min(9, total)) {
          current = n - 1;
          render();
        }
      }
  }
});
// Click anywhere to advance, except on links, code samples, form controls and the top bar.
document.addEventListener('click', (e) => {
  const t = e.target;
  if (t.closest('a, code, pre, input, button, .topbar')) return;
  // A drag-select ends with a click on the same element. If text is currently
  // selected the user was copying from the slide, not asking for the next one.
  const selection = window.getSelection ? window.getSelection() : null;
  if (selection && !selection.isCollapsed) return;
  go(1);
});
// Swipe: a single-finger, predominantly horizontal drag of more than 40px changes
// slide (leftwards = next, rightwards = previous).
let touchStartX = null;
let touchStartY = null;
document.addEventListener('touchstart', (e) => {
  // Ignore multi-touch (pinch zoom) and gestures that begin inside a horizontally
  // scrollable code panel, where a sideways drag means "scroll the code". This
  // mirrors the click handler, which also leaves code samples alone.
  const startedInScroller = e.target.closest?.('pre, .ascii');
  if (e.touches.length !== 1 || startedInScroller) {
    touchStartX = null;
    return;
  }
  touchStartX = e.touches[0].clientX;
  touchStartY = e.touches[0].clientY;
}, { passive: true });
document.addEventListener('touchend', (e) => {
  if (touchStartX === null) return;
  const dx = e.changedTouches[0].clientX - touchStartX;
  const dy = e.changedTouches[0].clientY - touchStartY;
  touchStartX = null;
  // Require the horizontal travel to dominate so that scrolling a tall slide
  // vertically with a little sideways drift does not flip the slide.
  if (Math.abs(dx) > 40 && Math.abs(dx) > Math.abs(dy)) go(dx < 0 ? 1 : -1);
}, { passive: true });
// An interrupted gesture must not leave a stale start point behind.
document.addEventListener('touchcancel', () => {
  touchStartX = null;
}, { passive: true });
// Follow fragment changes (for example an edited URL or history navigation) by reading
// "#s=N" again. Values that are non-numeric, out of range, or already equal to the
// active slide are ignored; the last case covers the hash write done by render() itself.
window.addEventListener('hashchange', () => {
  const query = location.hash.replace(/^#/, '?');
  const requested = parseInt(new URLSearchParams(query).get('s'), 10);
  if (isNaN(requested)) return;
  if (requested < 1 || requested > total) return;
  const index = requested - 1;
  if (index === current) return;
  current = index;
  render();
});
render();
})();
</script>
</body>
</html>