Compare commits

..

No commits in common. "3367b95b917ac5f442d63ad8c33ef767bcdb4f4e" and "28a40a2650215179434edfd362a540a82fd9cf21" have entirely different histories.

14 changed files with 5 additions and 1592 deletions

View file

@ -9,9 +9,7 @@ defmodule ProxmoxAgent.Application do
case load_config() do case load_config() do
{:ok, cfg} -> {:ok, cfg} ->
Logger.info("agent: starting with host_id=#{cfg.host_id}") Logger.info("agent: starting with host_id=#{cfg.host_id}")
:ok = ProxmoxAgent.Diagnostics.configure(cfg.dump_dir) [{ProxmoxAgent.Reporter, cfg}]
diagnostics_children(ProxmoxAgent.Diagnostics.dump_dir()) ++
[{ProxmoxAgent.Reporter, cfg}]
{:error, reason} -> {:error, reason} ->
Logger.error("agent: no config loaded (#{inspect(reason)}); running in idle mode") Logger.error("agent: no config loaded (#{inspect(reason)}); running in idle mode")
@ -21,9 +19,6 @@ defmodule ProxmoxAgent.Application do
Supervisor.start_link(children, strategy: :one_for_one, name: ProxmoxAgent.Supervisor) Supervisor.start_link(children, strategy: :one_for_one, name: ProxmoxAgent.Supervisor)
end end
defp diagnostics_children(nil), do: []
defp diagnostics_children(dir), do: [{ProxmoxAgent.Diagnostics.Writer, [dir: dir]}]
defp load_config do defp load_config do
path = path =
System.get_env("AGENT_CONFIG") || System.get_env("AGENT_CONFIG") ||

View file

@ -51,13 +51,13 @@ defmodule ProxmoxAgent.Collectors.Host do
end end
defp read_loadavg(proc_dir) do defp read_loadavg(proc_dir) do
body = read_and_log(Path.join(proc_dir, "loadavg")) body = File.read!(Path.join(proc_dir, "loadavg"))
[l1, l5, l15 | _] = String.split(body, ~r/\s+/, trim: true) [l1, l5, l15 | _] = String.split(body, ~r/\s+/, trim: true)
{to_float(l1), to_float(l5), to_float(l15)} {to_float(l1), to_float(l5), to_float(l15)}
end end
defp read_meminfo(proc_dir) do defp read_meminfo(proc_dir) do
body = read_and_log(Path.join(proc_dir, "meminfo")) body = File.read!(Path.join(proc_dir, "meminfo"))
parsed = parsed =
body body
@ -76,23 +76,11 @@ defmodule ProxmoxAgent.Collectors.Host do
end end
defp read_uptime(proc_dir) do defp read_uptime(proc_dir) do
body = read_and_log(Path.join(proc_dir, "uptime")) body = File.read!(Path.join(proc_dir, "uptime"))
[secs | _] = String.split(body, " ", trim: true) [secs | _] = String.split(body, " ", trim: true)
secs |> to_float() |> trunc() secs |> to_float() |> trunc()
end end
defp read_and_log(path) do
start = System.monotonic_time(:microsecond)
result = File.read(path)
duration_us = System.monotonic_time(:microsecond) - start
ProxmoxAgent.Diagnostics.log_read(path, result, duration_us)
case result do
{:ok, body} -> body
{:error, reason} -> raise File.Error, reason: reason, action: "read file", path: to_string(path)
end
end
defp kb_to_bytes(nil), do: nil defp kb_to_bytes(nil), do: nil
defp kb_to_bytes(str) do defp kb_to_bytes(str) do

View file

@ -5,7 +5,6 @@ defmodule ProxmoxAgent.Config do
:server_url, :server_url,
:token, :token,
:host_id, :host_id,
:dump_dir,
fast_seconds: 30, fast_seconds: 30,
medium_seconds: 300, medium_seconds: 300,
slow_seconds: 1800 slow_seconds: 1800
@ -15,7 +14,6 @@ defmodule ProxmoxAgent.Config do
server_url: String.t(), server_url: String.t(),
token: String.t(), token: String.t(),
host_id: String.t(), host_id: String.t(),
dump_dir: String.t() | nil,
fast_seconds: pos_integer(), fast_seconds: pos_integer(),
medium_seconds: pos_integer(), medium_seconds: pos_integer(),
slow_seconds: pos_integer() slow_seconds: pos_integer()
@ -59,24 +57,17 @@ defmodule ProxmoxAgent.Config do
defp build(map) do defp build(map) do
intervals = Map.get(map, "intervals", %{}) intervals = Map.get(map, "intervals", %{})
debug = Map.get(map, "debug", %{})
%__MODULE__{ %__MODULE__{
server_url: map["server_url"], server_url: map["server_url"],
token: map["token"], token: map["token"],
host_id: map["host_id"] || hostname(), host_id: map["host_id"] || hostname(),
dump_dir: normalize_dump_dir(Map.get(debug, "dump_dir")),
fast_seconds: Map.get(intervals, "fast_seconds", 30), fast_seconds: Map.get(intervals, "fast_seconds", 30),
medium_seconds: Map.get(intervals, "medium_seconds", 300), medium_seconds: Map.get(intervals, "medium_seconds", 300),
slow_seconds: Map.get(intervals, "slow_seconds", 1800) slow_seconds: Map.get(intervals, "slow_seconds", 1800)
} }
end end
defp normalize_dump_dir(nil), do: nil
defp normalize_dump_dir(""), do: nil
defp normalize_dump_dir(s) when is_binary(s), do: s
defp normalize_dump_dir(_), do: nil
defp hostname do defp hostname do
case :inet.gethostname() do case :inet.gethostname() do
{:ok, name} -> List.to_string(name) {:ok, name} -> List.to_string(name)

View file

@ -1,70 +0,0 @@
defmodule ProxmoxAgent.Diagnostics do
@moduledoc """
Optional diagnostic dump of external commands and outgoing samples.
Controlled by `:dump_dir` in the application env. When unset or nil,
`log_command/4` and `log_sample/2` are no-ops and incur no I/O.
When set, they cast to `ProxmoxAgent.Diagnostics.Writer` (if running)
which serializes appends to `commands.log` and `samples.log`.
"""
require Logger
@type command_result ::
{:ok, String.t()}
| {:error, term()}
@spec configure(String.t() | nil) :: :ok
def configure(nil), do: disable()
def configure(""), do: disable()
def configure(dir) when is_binary(dir) do
case File.mkdir_p(dir) do
:ok ->
Application.put_env(:agent, :dump_dir, dir)
Logger.info("diagnostics: enabled, writing to #{dir}")
:ok
{:error, reason} ->
Logger.warning("diagnostics: mkdir_p #{dir} failed (#{inspect(reason)}); dumping disabled")
disable()
end
end
@spec enabled?() :: boolean()
def enabled?, do: not is_nil(Application.get_env(:agent, :dump_dir))
@spec dump_dir() :: String.t() | nil
def dump_dir, do: Application.get_env(:agent, :dump_dir)
@spec log_command(String.t(), [String.t()], command_result(), non_neg_integer()) :: :ok
def log_command(cmd, args, result, duration_us) do
cast({:command, cmd, args, result, duration_us})
end
@spec log_sample(String.t(), map()) :: :ok
def log_sample(kind, payload) when is_binary(kind) and is_map(payload) do
cast({:sample, kind, payload})
end
@spec log_read(Path.t(), command_result(), non_neg_integer()) :: :ok
def log_read(path, result, duration_us) do
log_command("cat", [to_string(path)], result, duration_us)
end
defp disable do
Application.delete_env(:agent, :dump_dir)
:ok
end
defp cast(msg) do
if enabled?() do
case Process.whereis(ProxmoxAgent.Diagnostics.Writer) do
nil -> :ok
pid -> GenServer.cast(pid, msg)
end
end
:ok
end
end

View file

@ -1,98 +0,0 @@
defmodule ProxmoxAgent.Diagnostics.Writer do
@moduledoc """
Serializes diagnostic dump writes. Owns two append-only file handles
for commands.log and samples.log under a configured directory.
Started only when the agent's dump_dir is set. All writes are cast-based
to keep the caller path free of file I/O latency.
"""
use GenServer
require Logger
@spec start_link(keyword()) :: GenServer.on_start()
def start_link(opts) do
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end
@impl true
def init(opts) do
dir = Keyword.fetch!(opts, :dir)
# No trap_exit: the Writer owns no linked processes other than the supervisor.
# If you add a linked port/task, restore Process.flag(:trap_exit, true) and
# add a matching handle_info({:EXIT, _, _}, state) clause.
with {:ok, commands} <- open(Path.join(dir, "commands.log")),
{:ok, samples} <- open(Path.join(dir, "samples.log")) do
{:ok, %{dir: dir, commands: commands, samples: samples}}
else
{:error, reason} ->
{:stop, {:open_failed, reason}}
end
end
@impl true
def handle_cast({:command, cmd, args, result, duration_us}, state) do
write(state.commands, format_command(cmd, args, result, duration_us))
{:noreply, state}
end
def handle_cast({:sample, kind, payload}, state) do
write(state.samples, format_sample(kind, payload))
{:noreply, state}
end
@impl true
def handle_call(:flush, _from, state) do
{:reply, :ok, state}
end
@impl true
def terminate(_reason, state) do
File.close(state.commands)
File.close(state.samples)
:ok
end
# --- private ----------------------------------------------------------
defp open(path), do: File.open(path, [:append, :utf8])
defp write(io, data) do
case IO.write(io, data) do
:ok ->
:ok
{:error, reason} ->
Logger.warning("diagnostics writer: write failed (#{inspect(reason)})")
:ok
end
end
defp format_command(cmd, args, result, duration_us) do
header = "===== #{now_iso()} =====\n$ #{cmd} #{Enum.join(args, " ")}\n"
status =
case result do
{:ok, body} ->
"exit=0 duration=#{ms(duration_us)}ms size=#{byte_size(body)}\n\n#{body}\n\n"
{:error, {:nonzero_exit, code, body}} ->
"exit=#{code} duration=#{ms(duration_us)}ms size=#{byte_size(body)}\n\n#{body}\n\n"
{:error, reason} ->
"error=#{inspect(reason)} duration=#{ms(duration_us)}ms\n\n"
end
header <> status
end
defp format_sample(kind, payload) do
body = Jason.encode!(payload, pretty: true)
"===== #{now_iso()} kind=#{kind} =====\n#{body}\n\n"
end
defp now_iso, do: DateTime.utc_now() |> DateTime.to_iso8601()
defp ms(us) when is_integer(us), do: div(us, 1_000)
end

View file

@ -101,8 +101,6 @@ defmodule ProxmoxAgent.Reporter do
defp push_metric(socket, event, data) do defp push_metric(socket, event, data) do
payload = %{collected_at: DateTime.utc_now() |> DateTime.to_iso8601(), data: data} payload = %{collected_at: DateTime.utc_now() |> DateTime.to_iso8601(), data: data}
kind = String.replace_prefix(event, "metric:", "")
ProxmoxAgent.Diagnostics.log_sample(kind, payload)
case push(socket, socket.assigns.topic, event, payload) do case push(socket, socket.assigns.topic, event, payload) do
{:ok, _ref} -> {:ok, _ref} ->

View file

@ -8,14 +8,6 @@ defmodule ProxmoxAgent.Shell do
@spec run(String.t(), [String.t()]) :: result @spec run(String.t(), [String.t()]) :: result
def run(command, args) do def run(command, args) do
start = System.monotonic_time(:microsecond)
result = do_run(command, args)
duration_us = System.monotonic_time(:microsecond) - start
ProxmoxAgent.Diagnostics.log_command(command, args, result, duration_us)
result
end
defp do_run(command, args) do
try do try do
case System.cmd(command, args, stderr_to_stdout: true) do case System.cmd(command, args, stderr_to_stdout: true) do
{output, 0} -> {:ok, output} {output, 0} -> {:ok, output}

View file

@ -6,6 +6,3 @@ host_id = "pve-test-01"
fast_seconds = 15 fast_seconds = 15
medium_seconds = 120 medium_seconds = 120
slow_seconds = 600 slow_seconds = 600
[debug]
dump_dir = "/tmp/proxmox-monitor-test"

View file

@ -14,43 +14,6 @@ defmodule ProxmoxAgent.ConfigTest do
assert cfg.fast_seconds == 15 assert cfg.fast_seconds == 15
assert cfg.medium_seconds == 120 assert cfg.medium_seconds == 120
assert cfg.slow_seconds == 600 assert cfg.slow_seconds == 600
assert cfg.dump_dir == "/tmp/proxmox-monitor-test"
end
test "parses [debug] dump_dir when present" do
assert {:ok, cfg} = Config.load(@fixture)
assert cfg.dump_dir == "/tmp/proxmox-monitor-test"
end
test "dump_dir is nil when [debug] is absent" do
tmp = Path.join(System.tmp_dir!(), "agent_nodebug.toml")
File.write!(tmp, """
server_url = "wss://x/socket/websocket"
token = "t"
""")
on_exit(fn -> File.rm(tmp) end)
assert {:ok, cfg} = Config.load(tmp)
assert cfg.dump_dir == nil
end
test "dump_dir is nil when set to empty string" do
tmp = Path.join(System.tmp_dir!(), "agent_emptydebug.toml")
File.write!(tmp, """
server_url = "wss://x/socket/websocket"
token = "t"
[debug]
dump_dir = ""
""")
on_exit(fn -> File.rm(tmp) end)
assert {:ok, cfg} = Config.load(tmp)
assert cfg.dump_dir == nil
end end
test "returns error for missing file" do test "returns error for missing file" do

View file

@ -1,83 +0,0 @@
defmodule ProxmoxAgent.Diagnostics.WriterTest do
use ExUnit.Case, async: false
alias ProxmoxAgent.Diagnostics.Writer
setup do
dir = Path.join(System.tmp_dir!(), "writer-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
{:ok, pid} = Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(pid), do: GenServer.stop(pid)
File.rm_rf(dir)
end)
%{dir: dir, pid: pid}
end
test "handle_cast :command appends a formatted entry to commands.log", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "zpool", ["list", "-j"], {:ok, "hello body"}, 1_234})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ zpool list -j"
assert body =~ "exit=0"
assert body =~ "duration=1ms"
assert body =~ "size=10"
assert body =~ "hello body"
assert body =~ "=====\n"
end
test "handle_cast :command records errors in place of exit", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "zpool", ["list"], {:error, {:enoent, "zpool"}}, 300})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "error={:enoent, \"zpool\"}"
assert body =~ "duration=0ms"
refute body =~ "exit="
end
test "handle_cast :command formats non-zero exit results", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "/bin/sh", ["-c", "exit 7"], {:error, {:nonzero_exit, 7, "oops"}}, 500})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "exit=7"
assert body =~ "size=4"
assert body =~ "oops"
end
test "handle_cast :sample appends pretty-printed JSON to samples.log", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:sample, "fast", %{"host" => %{"load1" => 0.1}}})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "samples.log"))
assert body =~ "kind=fast"
assert body =~ "\"host\""
assert body =~ "\"load1\""
assert body =~ "=====\n"
end
test "multiple casts serialize and do not interleave", %{dir: dir, pid: pid} do
for i <- 1..50 do
GenServer.cast(pid, {:command, "cmd#{i}", [], {:ok, "body#{i}"}, 1})
end
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
# Every body1..body50 must appear exactly once, in order.
for i <- 1..50 do
assert body =~ "body#{i}"
end
end
test "terminate closes handles cleanly", %{pid: pid} do
ref = Process.monitor(pid)
GenServer.stop(pid, :normal)
assert_receive {:DOWN, ^ref, :process, ^pid, :normal}, 1000
end
end

View file

@ -1,110 +0,0 @@
defmodule ProxmoxAgent.DiagnosticsTest do
use ExUnit.Case, async: false
alias ProxmoxAgent.Diagnostics
setup do
# Isolate tests: clear the env key before and after each test.
Application.delete_env(:agent, :dump_dir)
on_exit(fn -> Application.delete_env(:agent, :dump_dir) end)
:ok
end
describe "configure/1 and enabled?/0" do
test "nil disables and returns :ok" do
assert :ok = Diagnostics.configure(nil)
refute Diagnostics.enabled?()
end
test "empty string disables and returns :ok" do
assert :ok = Diagnostics.configure("")
refute Diagnostics.enabled?()
end
test "valid path creates the directory and enables" do
dir = Path.join(System.tmp_dir!(), "diag-#{System.unique_integer([:positive])}")
on_exit(fn -> File.rm_rf(dir) end)
assert :ok = Diagnostics.configure(dir)
assert Diagnostics.enabled?()
assert File.dir?(dir)
assert Application.get_env(:agent, :dump_dir) == dir
end
test "unreachable path disables (does not crash)" do
# Point at a path under a non-directory file to force mkdir_p failure.
parent = Path.join(System.tmp_dir!(), "diag-parent-#{System.unique_integer([:positive])}")
File.write!(parent, "not a directory")
dir = Path.join(parent, "child")
on_exit(fn -> File.rm_rf(parent) end)
assert :ok = Diagnostics.configure(dir)
refute Diagnostics.enabled?()
end
end
describe "log_command/4 and log_sample/2 (no writer running)" do
test "log_command/4 no-ops and returns :ok when disabled" do
assert :ok = Diagnostics.log_command("zpool", ["list"], {:ok, "body"}, 1_234)
end
test "log_sample/2 no-ops and returns :ok when disabled" do
assert :ok = Diagnostics.log_sample("fast", %{foo: "bar"})
end
test "log_command/4 no-ops when enabled but writer is not running" do
dir = Path.join(System.tmp_dir!(), "diag-#{System.unique_integer([:positive])}")
on_exit(fn -> File.rm_rf(dir) end)
:ok = Diagnostics.configure(dir)
# Writer is not started in this test — log should still be safe.
assert :ok = Diagnostics.log_command("zpool", ["list"], {:ok, "body"}, 1_234)
end
end
describe "log_command/4 and log_sample/2 with writer running" do
setup do
dir = Path.join(System.tmp_dir!(), "diag-e2e-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
:ok = Diagnostics.configure(dir)
{:ok, pid} = ProxmoxAgent.Diagnostics.Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(pid), do: GenServer.stop(pid)
File.rm_rf(dir)
end)
%{dir: dir}
end
test "log_command/4 writes to commands.log", %{dir: dir} do
assert :ok = Diagnostics.log_command("zpool", ["list", "-j"], {:ok, "body"}, 2_500)
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ zpool list -j"
assert body =~ "exit=0"
assert body =~ "size=4"
end
test "log_sample/2 writes to samples.log", %{dir: dir} do
assert :ok = Diagnostics.log_sample("fast", %{"zfs_pools" => %{"pools" => []}})
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "samples.log"))
assert body =~ "kind=fast"
assert body =~ "\"zfs_pools\""
end
test "log_read/3 writes to commands.log formatted as cat", %{dir: dir} do
assert :ok = Diagnostics.log_read("/proc/loadavg", {:ok, "0.12 0.34 0.56 1/200 3000"}, 150)
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ cat /proc/loadavg"
assert body =~ "0.12 0.34 0.56"
assert body =~ "exit=0"
end
end
end

View file

@ -1,5 +1,5 @@
defmodule ProxmoxAgent.ShellTest do defmodule ProxmoxAgent.ShellTest do
use ExUnit.Case, async: false use ExUnit.Case, async: true
alias ProxmoxAgent.Shell alias ProxmoxAgent.Shell
@ -16,25 +16,4 @@ defmodule ProxmoxAgent.ShellTest do
test "run/2 returns {:error, {:enoent, _}} when binary is missing" do test "run/2 returns {:error, {:enoent, _}} when binary is missing" do
assert {:error, {:enoent, _}} = Shell.run("/does/not/exist/nope", []) assert {:error, {:enoent, _}} = Shell.run("/does/not/exist/nope", [])
end end
test "run/2 logs to Diagnostics when dump_dir is set" do
dir = Path.join(System.tmp_dir!(), "shell-dump-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
:ok = ProxmoxAgent.Diagnostics.configure(dir)
{:ok, writer} = ProxmoxAgent.Diagnostics.Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(writer), do: GenServer.stop(writer)
Application.delete_env(:agent, :dump_dir)
File.rm_rf(dir)
end)
assert {:ok, _} = Shell.run("/bin/echo", ["diagnostics-test"])
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ /bin/echo diagnostics-test"
assert body =~ "exit=0"
assert body =~ "diagnostics-test"
end
end end

View file

@ -1,892 +0,0 @@
# Agent Diagnostic Dump Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Add an opt-in diagnostic dump to the agent that logs every external command (with full raw output) and every outgoing metric payload, so the operator can verify source data before it hits the server.
**Architecture:** A new `ProxmoxAgent.Diagnostics` module holds the on/off state in the application env (`:agent, :dump_dir`). When enabled, a `ProxmoxAgent.Diagnostics.Writer` GenServer owns append-only file handles for `commands.log` and `samples.log` and serializes all writes. `Shell.run/2` calls `Diagnostics.log_command/4` after each command; `Reporter.push_metric/3` calls `Diagnostics.log_sample/2` before each push. When `dump_dir` is unset, both helpers are no-ops and the Writer is not started.
**Tech Stack:** Elixir / OTP, ExUnit, `GenServer`, `Application.get_env`, `File` / `IO.write`. Spec: `docs/superpowers/specs/2026-04-22-agent-diagnostic-dump-design.md`.
---
## File Structure
**Create:**
- `agent/lib/proxmox_agent/diagnostics.ex` — public API: `configure/1`, `enabled?/0`, `log_command/4`, `log_sample/2`. Thin wrapper around the Writer.
- `agent/lib/proxmox_agent/diagnostics/writer.ex``GenServer` that owns the two file handles and serializes writes.
- `agent/test/proxmox_agent/diagnostics_test.exs` — unit tests for the public API (both disabled and enabled paths).
- `agent/test/proxmox_agent/diagnostics/writer_test.exs` — unit tests for the writer GenServer.
**Modify:**
- `agent/lib/proxmox_agent/config.ex` — add `:dump_dir` field (String.t() | nil).
- `agent/test/proxmox_agent/config_test.exs` — test `[debug] dump_dir` parsing.
- `agent/test/fixtures/agent.toml` — add `[debug] dump_dir` to the fixture.
- `agent/lib/proxmox_agent/shell.ex` — add timing + call `Diagnostics.log_command/4`.
- `agent/test/proxmox_agent/shell_test.exs` — add a test that exercises the dump path.
- `agent/lib/proxmox_agent/reporter.ex` — add one line calling `Diagnostics.log_sample/2` before `push/4`.
- `agent/lib/proxmox_agent/application.ex` — call `Diagnostics.configure/1` and conditionally add the Writer to the supervision tree.
---
## Task 1: Config — add `dump_dir` field
**Files:**
- Modify: `agent/lib/proxmox_agent/config.ex`
- Modify: `agent/test/proxmox_agent/config_test.exs`
- Modify: `agent/test/fixtures/agent.toml`
### - [ ] Step 1: Add `dump_dir` to the fixture
Replace the contents of `agent/test/fixtures/agent.toml` with:
```toml
server_url = "wss://monitor.example.com/socket/websocket"
token = "test_token_123"
host_id = "pve-test-01"
[intervals]
fast_seconds = 15
medium_seconds = 120
slow_seconds = 600
[debug]
dump_dir = "/tmp/proxmox-monitor-test"
```
### - [ ] Step 2: Add a failing Config test for `dump_dir`
In `agent/test/proxmox_agent/config_test.exs`, add this test inside `describe "load/1"`:
```elixir
test "parses [debug] dump_dir when present" do
assert {:ok, cfg} = Config.load(@fixture)
assert cfg.dump_dir == "/tmp/proxmox-monitor-test"
end
test "dump_dir is nil when [debug] is absent" do
tmp = Path.join(System.tmp_dir!(), "agent_nodebug.toml")
File.write!(tmp, """
server_url = "wss://x/socket/websocket"
token = "t"
""")
on_exit(fn -> File.rm(tmp) end)
assert {:ok, cfg} = Config.load(tmp)
assert cfg.dump_dir == nil
end
test "dump_dir is nil when set to empty string" do
tmp = Path.join(System.tmp_dir!(), "agent_emptydebug.toml")
File.write!(tmp, """
server_url = "wss://x/socket/websocket"
token = "t"
[debug]
dump_dir = ""
""")
on_exit(fn -> File.rm(tmp) end)
assert {:ok, cfg} = Config.load(tmp)
assert cfg.dump_dir == nil
end
```
Update the existing `"parses required fields"` test to also assert on `dump_dir`:
```elixir
test "parses required fields" do
assert {:ok, cfg} = Config.load(@fixture)
assert cfg.server_url == "wss://monitor.example.com/socket/websocket"
assert cfg.token == "test_token_123"
assert cfg.host_id == "pve-test-01"
assert cfg.fast_seconds == 15
assert cfg.medium_seconds == 120
assert cfg.slow_seconds == 600
assert cfg.dump_dir == "/tmp/proxmox-monitor-test"
end
```
### - [ ] Step 3: Run Config tests — expect FAIL
Run: `cd agent && mix test test/proxmox_agent/config_test.exs`
Expected: at least the three new assertions fail because `Config` struct has no `:dump_dir` field and `build/1` does not read `[debug] dump_dir`.
### - [ ] Step 4: Add `:dump_dir` to the Config struct and build/1
Edit `agent/lib/proxmox_agent/config.ex`. Replace the `defstruct` and `@type` blocks with:
```elixir
defstruct [
:server_url,
:token,
:host_id,
:dump_dir,
fast_seconds: 30,
medium_seconds: 300,
slow_seconds: 1800
]
@type t :: %__MODULE__{
server_url: String.t(),
token: String.t(),
host_id: String.t(),
dump_dir: String.t() | nil,
fast_seconds: pos_integer(),
medium_seconds: pos_integer(),
slow_seconds: pos_integer()
}
```
Replace the `build/1` function with:
```elixir
defp build(map) do
intervals = Map.get(map, "intervals", %{})
debug = Map.get(map, "debug", %{})
%__MODULE__{
server_url: map["server_url"],
token: map["token"],
host_id: map["host_id"] || hostname(),
dump_dir: normalize_dump_dir(Map.get(debug, "dump_dir")),
fast_seconds: Map.get(intervals, "fast_seconds", 30),
medium_seconds: Map.get(intervals, "medium_seconds", 300),
slow_seconds: Map.get(intervals, "slow_seconds", 1800)
}
end
defp normalize_dump_dir(nil), do: nil
defp normalize_dump_dir(""), do: nil
defp normalize_dump_dir(s) when is_binary(s), do: s
defp normalize_dump_dir(_), do: nil
```
### - [ ] Step 5: Run Config tests — expect PASS
Run: `cd agent && mix test test/proxmox_agent/config_test.exs`
Expected: all tests pass.
### - [ ] Step 6: Commit
```bash
git add agent/lib/proxmox_agent/config.ex \
agent/test/proxmox_agent/config_test.exs \
agent/test/fixtures/agent.toml
git commit -m "feat(agent): add [debug] dump_dir config field"
```
---
## Task 2: Diagnostics module — no-op API skeleton
**Files:**
- Create: `agent/lib/proxmox_agent/diagnostics.ex`
- Create: `agent/test/proxmox_agent/diagnostics_test.exs`
This task introduces the public API with a null-object behavior (no Writer yet). Logging calls succeed and silently no-op when disabled. This proves the API is safe to call from `Shell` and `Reporter` even before the Writer is wired in.
### - [ ] Step 1: Write failing tests for configure/enabled? and no-op logging
Create `agent/test/proxmox_agent/diagnostics_test.exs`:
```elixir
defmodule ProxmoxAgent.DiagnosticsTest do
use ExUnit.Case, async: false
alias ProxmoxAgent.Diagnostics
setup do
# Isolate tests: clear the env key before and after each test.
Application.delete_env(:agent, :dump_dir)
on_exit(fn -> Application.delete_env(:agent, :dump_dir) end)
:ok
end
describe "configure/1 and enabled?/0" do
test "nil disables and returns :ok" do
assert :ok = Diagnostics.configure(nil)
refute Diagnostics.enabled?()
end
test "empty string disables and returns :ok" do
assert :ok = Diagnostics.configure("")
refute Diagnostics.enabled?()
end
test "valid path creates the directory and enables" do
dir = Path.join(System.tmp_dir!(), "diag-#{System.unique_integer([:positive])}")
on_exit(fn -> File.rm_rf(dir) end)
assert :ok = Diagnostics.configure(dir)
assert Diagnostics.enabled?()
assert File.dir?(dir)
assert Application.get_env(:agent, :dump_dir) == dir
end
test "unreachable path disables (does not crash)" do
# Point at a path under a non-directory file to force mkdir_p failure.
parent = Path.join(System.tmp_dir!(), "diag-parent-#{System.unique_integer([:positive])}")
File.write!(parent, "not a directory")
dir = Path.join(parent, "child")
on_exit(fn -> File.rm_rf(parent) end)
assert :ok = Diagnostics.configure(dir)
refute Diagnostics.enabled?()
end
end
describe "log_command/4 and log_sample/2 (no writer running)" do
test "log_command/4 no-ops and returns :ok when disabled" do
assert :ok = Diagnostics.log_command("zpool", ["list"], {:ok, "body"}, 1_234)
end
test "log_sample/2 no-ops and returns :ok when disabled" do
assert :ok = Diagnostics.log_sample("fast", %{foo: "bar"})
end
test "log_command/4 no-ops when enabled but writer is not running" do
dir = Path.join(System.tmp_dir!(), "diag-#{System.unique_integer([:positive])}")
on_exit(fn -> File.rm_rf(dir) end)
:ok = Diagnostics.configure(dir)
# Writer is not started in this test — log should still be safe.
assert :ok = Diagnostics.log_command("zpool", ["list"], {:ok, "body"}, 1_234)
end
end
end
```
### - [ ] Step 2: Run tests — expect FAIL (module does not exist)
Run: `cd agent && mix test test/proxmox_agent/diagnostics_test.exs`
Expected: FAIL — `ProxmoxAgent.Diagnostics.__struct__/0 is undefined (module ProxmoxAgent.Diagnostics is not available)` or compilation error.
### - [ ] Step 3: Create the Diagnostics module
Create `agent/lib/proxmox_agent/diagnostics.ex`:
```elixir
defmodule ProxmoxAgent.Diagnostics do
@moduledoc """
Optional diagnostic dump of external commands and outgoing samples.
Controlled by `:dump_dir` in the application env. When unset or nil,
`log_command/4` and `log_sample/2` are no-ops and incur no I/O.
When set, they cast to `ProxmoxAgent.Diagnostics.Writer` (if running)
which serializes appends to `commands.log` and `samples.log`.
"""
require Logger
@type command_result ::
{:ok, String.t()}
| {:error, term()}
@spec configure(String.t() | nil) :: :ok
def configure(nil), do: disable()
def configure(""), do: disable()
def configure(dir) when is_binary(dir) do
case File.mkdir_p(dir) do
:ok ->
Application.put_env(:agent, :dump_dir, dir)
:ok
{:error, reason} ->
Logger.warning("diagnostics: mkdir_p #{dir} failed (#{inspect(reason)}); dumping disabled")
disable()
end
end
@spec enabled?() :: boolean()
def enabled?, do: not is_nil(Application.get_env(:agent, :dump_dir))
@spec dump_dir() :: String.t() | nil
def dump_dir, do: Application.get_env(:agent, :dump_dir)
@spec log_command(String.t(), [String.t()], command_result(), non_neg_integer()) :: :ok
def log_command(cmd, args, result, duration_us) do
cast({:command, cmd, args, result, duration_us})
end
@spec log_sample(String.t(), map()) :: :ok
def log_sample(kind, payload) when is_binary(kind) and is_map(payload) do
cast({:sample, kind, payload})
end
defp disable do
Application.delete_env(:agent, :dump_dir)
:ok
end
defp cast(msg) do
if enabled?() do
case Process.whereis(ProxmoxAgent.Diagnostics.Writer) do
nil -> :ok
pid -> GenServer.cast(pid, msg)
end
end
:ok
end
end
```
### - [ ] Step 4: Run tests — expect PASS
Run: `cd agent && mix test test/proxmox_agent/diagnostics_test.exs`
Expected: all tests pass.
### - [ ] Step 5: Commit
```bash
git add agent/lib/proxmox_agent/diagnostics.ex \
agent/test/proxmox_agent/diagnostics_test.exs
git commit -m "feat(agent): add Diagnostics module with no-op default"
```
---
## Task 3: Diagnostics.Writer GenServer
**Files:**
- Create: `agent/lib/proxmox_agent/diagnostics/writer.ex`
- Create: `agent/test/proxmox_agent/diagnostics/writer_test.exs`
### - [ ] Step 1: Write failing tests for the Writer
Create `agent/test/proxmox_agent/diagnostics/writer_test.exs`:
```elixir
defmodule ProxmoxAgent.Diagnostics.WriterTest do
use ExUnit.Case, async: false
alias ProxmoxAgent.Diagnostics.Writer
setup do
dir = Path.join(System.tmp_dir!(), "writer-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
{:ok, pid} = Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(pid), do: GenServer.stop(pid)
File.rm_rf(dir)
end)
%{dir: dir, pid: pid}
end
test "handle_cast :command appends a formatted entry to commands.log", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "zpool", ["list", "-j"], {:ok, "hello body"}, 1_234})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ zpool list -j"
assert body =~ "exit=0"
assert body =~ "duration=1ms"
assert body =~ "size=10"
assert body =~ "hello body"
assert body =~ "=====\n"
end
test "handle_cast :command records errors in place of exit", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "zpool", ["list"], {:error, {:enoent, "zpool"}}, 300})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "error={:enoent, \"zpool\"}"
assert body =~ "duration=0ms"
refute body =~ "exit="
end
test "handle_cast :command formats non-zero exit results", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:command, "/bin/sh", ["-c", "exit 7"], {:error, {:nonzero_exit, 7, "oops"}}, 500})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "exit=7"
assert body =~ "size=4"
assert body =~ "oops"
end
test "handle_cast :sample appends pretty-printed JSON to samples.log", %{dir: dir, pid: pid} do
GenServer.cast(pid, {:sample, "fast", %{"host" => %{"load1" => 0.1}}})
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "samples.log"))
assert body =~ "kind=fast"
assert body =~ "\"host\""
assert body =~ "\"load1\""
assert body =~ "=====\n"
end
test "multiple casts serialize and do not interleave", %{dir: dir, pid: pid} do
for i <- 1..50 do
GenServer.cast(pid, {:command, "cmd#{i}", [], {:ok, "body#{i}"}, 1})
end
:ok = GenServer.call(pid, :flush)
body = File.read!(Path.join(dir, "commands.log"))
# Every body1..body50 must appear exactly once, in order.
for i <- 1..50 do
assert body =~ "body#{i}"
end
end
test "terminate closes handles cleanly", %{pid: pid} do
ref = Process.monitor(pid)
GenServer.stop(pid, :normal)
assert_receive {:DOWN, ^ref, :process, ^pid, :normal}, 1000
end
end
```
The `:flush` call is a synchronous no-op whose only purpose is to wait until every previously-cast message has been handled — a common ExUnit idiom for testing cast-based GenServers.
### - [ ] Step 2: Run tests — expect FAIL
Run: `cd agent && mix test test/proxmox_agent/diagnostics/writer_test.exs`
Expected: FAIL — Writer module does not exist.
### - [ ] Step 3: Create the Writer GenServer
Create `agent/lib/proxmox_agent/diagnostics/writer.ex`:
```elixir
defmodule ProxmoxAgent.Diagnostics.Writer do
@moduledoc """
Serializes diagnostic dump writes. Owns two append-only file handles
for commands.log and samples.log under a configured directory.
Started only when the agent's dump_dir is set. All writes are cast-based
to keep the caller path free of file I/O latency.
"""
use GenServer
require Logger
@spec start_link(keyword()) :: GenServer.on_start()
def start_link(opts) do
GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end
@impl true
def init(opts) do
dir = Keyword.fetch!(opts, :dir)
with {:ok, commands} <- open(Path.join(dir, "commands.log")),
{:ok, samples} <- open(Path.join(dir, "samples.log")) do
Process.flag(:trap_exit, true)
{:ok, %{dir: dir, commands: commands, samples: samples}}
else
{:error, reason} ->
{:stop, {:open_failed, reason}}
end
end
@impl true
def handle_cast({:command, cmd, args, result, duration_us}, state) do
write(state.commands, format_command(cmd, args, result, duration_us))
{:noreply, state}
end
def handle_cast({:sample, kind, payload}, state) do
write(state.samples, format_sample(kind, payload))
{:noreply, state}
end
@impl true
def handle_call(:flush, _from, state) do
{:reply, :ok, state}
end
@impl true
def terminate(_reason, state) do
File.close(state.commands)
File.close(state.samples)
:ok
end
# --- private ----------------------------------------------------------
defp open(path), do: File.open(path, [:append, :utf8])
defp write(io, data) do
case IO.write(io, data) do
:ok ->
:ok
{:error, reason} ->
Logger.warning("diagnostics writer: write failed (#{inspect(reason)})")
:ok
end
end
defp format_command(cmd, args, result, duration_us) do
header = "===== #{now_iso()} =====\n$ #{cmd} #{Enum.join(args, " ")}\n"
status =
case result do
{:ok, body} ->
"exit=0 duration=#{ms(duration_us)}ms size=#{byte_size(body)}\n\n#{body}\n\n"
{:error, {:nonzero_exit, code, body}} ->
"exit=#{code} duration=#{ms(duration_us)}ms size=#{byte_size(body)}\n\n#{body}\n\n"
{:error, reason} ->
"error=#{inspect(reason)} duration=#{ms(duration_us)}ms\n\n"
end
header <> status
end
defp format_sample(kind, payload) do
body = Jason.encode!(payload, pretty: true)
"===== #{now_iso()} kind=#{kind} =====\n#{body}\n\n"
end
defp now_iso, do: DateTime.utc_now() |> DateTime.to_iso8601()
defp ms(us) when is_integer(us), do: div(us, 1_000)
end
```
### - [ ] Step 4: Run tests — expect PASS
Run: `cd agent && mix test test/proxmox_agent/diagnostics/writer_test.exs`
Expected: all tests pass.
### - [ ] Step 5: Commit
```bash
git add agent/lib/proxmox_agent/diagnostics/writer.ex \
agent/test/proxmox_agent/diagnostics/writer_test.exs
git commit -m "feat(agent): add Diagnostics.Writer GenServer"
```
---
## Task 4: Diagnostics end-to-end test with Writer
**Files:**
- Modify: `agent/test/proxmox_agent/diagnostics_test.exs`
This task adds a test that exercises `log_command/4` and `log_sample/2` through the real Writer, proving the cast path works and the file contents reflect the public API.
### - [ ] Step 1: Add the end-to-end tests
Append this block to `agent/test/proxmox_agent/diagnostics_test.exs` after the existing `describe` blocks:
```elixir
describe "log_command/4 and log_sample/2 with writer running" do
setup do
dir = Path.join(System.tmp_dir!(), "diag-e2e-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
:ok = Diagnostics.configure(dir)
{:ok, pid} = ProxmoxAgent.Diagnostics.Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(pid), do: GenServer.stop(pid)
File.rm_rf(dir)
end)
%{dir: dir}
end
test "log_command/4 writes to commands.log", %{dir: dir} do
assert :ok = Diagnostics.log_command("zpool", ["list", "-j"], {:ok, "body"}, 2_500)
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ zpool list -j"
assert body =~ "exit=0"
assert body =~ "size=4"
end
test "log_sample/2 writes to samples.log", %{dir: dir} do
assert :ok = Diagnostics.log_sample("fast", %{"zfs_pools" => %{"pools" => []}})
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "samples.log"))
assert body =~ "kind=fast"
assert body =~ "\"zfs_pools\""
end
end
```
### - [ ] Step 2: Run tests — expect PASS
Run: `cd agent && mix test test/proxmox_agent/diagnostics_test.exs`
Expected: all tests pass (existing no-op tests plus the new e2e tests).
### - [ ] Step 3: Commit
```bash
git add agent/test/proxmox_agent/diagnostics_test.exs
git commit -m "test(agent): end-to-end Diagnostics test with running Writer"
```
---
## Task 5: Shell — wrap run/2 with timing and diagnostics hook
**Files:**
- Modify: `agent/lib/proxmox_agent/shell.ex`
- Modify: `agent/test/proxmox_agent/shell_test.exs`
### - [ ] Step 1: Add a failing Shell test that asserts a dump entry
Add this test to `agent/test/proxmox_agent/shell_test.exs`:
```elixir
test "run/2 logs to Diagnostics when dump_dir is set" do
dir = Path.join(System.tmp_dir!(), "shell-dump-#{System.unique_integer([:positive])}")
File.mkdir_p!(dir)
:ok = ProxmoxAgent.Diagnostics.configure(dir)
{:ok, writer} = ProxmoxAgent.Diagnostics.Writer.start_link(dir: dir)
on_exit(fn ->
if Process.alive?(writer), do: GenServer.stop(writer)
Application.delete_env(:agent, :dump_dir)
File.rm_rf(dir)
end)
assert {:ok, _} = Shell.run("/bin/echo", ["diagnostics-test"])
:ok = GenServer.call(ProxmoxAgent.Diagnostics.Writer, :flush)
body = File.read!(Path.join(dir, "commands.log"))
assert body =~ "$ /bin/echo diagnostics-test"
assert body =~ "exit=0"
assert body =~ "diagnostics-test"
end
```
The file should also change to `use ExUnit.Case, async: false` at the top, because manipulating the application env is not safe with async tests sharing the VM:
```elixir
defmodule ProxmoxAgent.ShellTest do
use ExUnit.Case, async: false
```
### - [ ] Step 2: Run Shell tests — expect FAIL
Run: `cd agent && mix test test/proxmox_agent/shell_test.exs`
Expected: the new test fails because `Shell.run/2` does not call `Diagnostics.log_command/4`.
### - [ ] Step 3: Rewrite Shell.run/2 to time the call and dispatch to Diagnostics
Replace the entire body of `agent/lib/proxmox_agent/shell.ex` with:
```elixir
defmodule ProxmoxAgent.Shell do
@moduledoc """
Thin wrapper over System.cmd for testability. Collectors accept an optional
:runner function of this shape so tests can inject fixture-backed fakes.
"""
@type result :: {:ok, String.t()} | {:error, term()}
@spec run(String.t(), [String.t()]) :: result
def run(command, args) do
start = System.monotonic_time(:microsecond)
result = do_run(command, args)
duration_us = System.monotonic_time(:microsecond) - start
ProxmoxAgent.Diagnostics.log_command(command, args, result, duration_us)
result
end
defp do_run(command, args) do
try do
case System.cmd(command, args, stderr_to_stdout: true) do
{output, 0} -> {:ok, output}
{output, code} -> {:error, {:nonzero_exit, code, output}}
end
rescue
e in ErlangError ->
case e.original do
:enoent -> {:error, {:enoent, command}}
other -> {:error, {:system_error, other}}
end
end
end
end
```
### - [ ] Step 4: Run Shell tests — expect PASS
Run: `cd agent && mix test test/proxmox_agent/shell_test.exs`
Expected: all four Shell tests pass.
### - [ ] Step 5: Run the full agent suite to verify no regressions
Run: `cd agent && mix test`
Expected: all tests pass.
### - [ ] Step 6: Commit
```bash
git add agent/lib/proxmox_agent/shell.ex \
agent/test/proxmox_agent/shell_test.exs
git commit -m "feat(agent): log every shell command to Diagnostics"
```
---
## Task 6: Reporter — log sample before push
**Files:**
- Modify: `agent/lib/proxmox_agent/reporter.ex`
This is a one-line addition in `push_metric/3` with no behavior change when dumping is off. The `Diagnostics.log_sample/2` path is already covered by Task 4's end-to-end test, so no new Reporter test is required.
### - [ ] Step 1: Add `log_sample/2` call in push_metric
Edit `agent/lib/proxmox_agent/reporter.ex`. Replace the existing `defp push_metric/3` with:
```elixir
defp push_metric(socket, event, data) do
payload = %{collected_at: DateTime.utc_now() |> DateTime.to_iso8601(), data: data}
kind = String.replace_prefix(event, "metric:", "")
ProxmoxAgent.Diagnostics.log_sample(kind, payload)
case push(socket, socket.assigns.topic, event, payload) do
{:ok, _ref} ->
:ok
{:error, reason} ->
Logger.warning("reporter: push #{event} failed: #{inspect(reason)}")
:ok
end
end
```
### - [ ] Step 2: Run the full agent suite to confirm no regressions
Run: `cd agent && mix test`
Expected: all tests pass.
### - [ ] Step 3: Commit
```bash
git add agent/lib/proxmox_agent/reporter.ex
git commit -m "feat(agent): log outgoing sample to Diagnostics before push"
```
---
## Task 7: Application — configure Diagnostics and start Writer
**Files:**
- Modify: `agent/lib/proxmox_agent/application.ex`
### - [ ] Step 1: Wire Diagnostics.configure and conditionally add the Writer child
Replace the `start/2` function in `agent/lib/proxmox_agent/application.ex` with:
```elixir
@impl true
def start(_type, _args) do
children =
case load_config() do
{:ok, cfg} ->
Logger.info("agent: starting with host_id=#{cfg.host_id}")
:ok = ProxmoxAgent.Diagnostics.configure(cfg.dump_dir)
diagnostics_children(ProxmoxAgent.Diagnostics.dump_dir()) ++
[{ProxmoxAgent.Reporter, cfg}]
{:error, reason} ->
Logger.error("agent: no config loaded (#{inspect(reason)}); running in idle mode")
[]
end
Supervisor.start_link(children, strategy: :one_for_one, name: ProxmoxAgent.Supervisor)
end
defp diagnostics_children(nil), do: []
defp diagnostics_children(dir), do: [{ProxmoxAgent.Diagnostics.Writer, [dir: dir]}]
```
Rationale: we call `Diagnostics.configure/1` with the config value, which normalizes nil/empty and creates the directory if needed. We then read back `Diagnostics.dump_dir()` — this returns nil if `configure/1` failed (e.g. mkdir failed) or if the config had no dump_dir. If non-nil, the Writer is prepended to the child list so it starts before the Reporter.
### - [ ] Step 2: Run the full agent suite to confirm no regressions
Run: `cd agent && mix test`
Expected: all tests pass. (The Application itself is not exercised in unit tests — it's started by `mix test` only if explicitly required, and our existing tests do not start the real supervision tree.)
### - [ ] Step 3: Smoke-check startup with dump enabled
Create a temporary config file:
```bash
cat > /tmp/proxmox-monitor-smoke.toml <<'EOF'
server_url = "wss://localhost:9999/socket/websocket"
token = "smoke"
host_id = "smoke-host"
[debug]
dump_dir = "/tmp/proxmox-monitor-smoke-dump"
EOF
```
Start the app in the IEx shell:
```bash
cd agent && AGENT_CONFIG=/tmp/proxmox-monitor-smoke.toml iex -S mix
```
In IEx, verify the Writer is alive and the directory exists:
```elixir
Process.whereis(ProxmoxAgent.Diagnostics.Writer) |> is_pid()
# expected: true
File.exists?("/tmp/proxmox-monitor-smoke-dump/commands.log")
# expected: true (may be empty if no commands ran yet)
ProxmoxAgent.Diagnostics.enabled?()
# expected: true
```
Exit IEx with `Ctrl-C Ctrl-C`. Clean up: `rm -rf /tmp/proxmox-monitor-smoke-dump /tmp/proxmox-monitor-smoke.toml`.
If IEx is not available in the environment, skip this manual smoke check and rely on the `mix test` suite.
### - [ ] Step 4: Commit
```bash
git add agent/lib/proxmox_agent/application.ex
git commit -m "feat(agent): start Diagnostics.Writer when dump_dir is configured"
```
---
## Final verification
After all seven tasks, run the full suites on both sides:
```bash
cd agent && mix test
cd ../server && mix test
```
Expected: agent has grown by the new tests (Config +3, Diagnostics +~8, Writer +5, Shell +1) and all pass; server suite is unchanged and still passes.

View file

@ -1,237 +0,0 @@
# Agent Diagnostic Dump — Design
Date: 2026-04-22
Status: approved (pending implementation)
## Problem
When the dashboard shows data that looks wrong, it's impossible to tell whether the agent's shell commands returned the wrong data, the collector parsed it wrong, the channel dropped a field, or the LiveView rendered it wrong. The agent currently does not log the raw `zpool` / `zfs` / `pvesh` output, nor the assembled payloads it pushes.
To verify source data is correct, the operator needs to see:
1. Every shell command the agent runs and its full raw output.
2. The final assembled payload for each fast/medium/slow push, right before it leaves the agent.
Both need to be inspectable with `tail -f`, on demand, without rebuilding the Burrito binary.
## Goal
Opt-in diagnostic dump, controlled by a single TOML config key. When enabled, the agent writes two append-only files containing command results and outgoing payloads. When disabled (default), behavior and I/O cost are unchanged.
## Scope
In scope:
1. Config — new `[debug] dump_dir` key in `agent.toml`.
2. `ProxmoxAgent.Shell.run/2` — when dump is enabled, append one entry per command containing timestamp, command + args, exit/error, duration, output size, full raw body, and a separator line.
3. `ProxmoxAgent.Reporter.push_metric/3` — when dump is enabled, append one entry per metric push right before `push/4`, containing timestamp, kind (`fast`/`medium`/`slow`), full payload as indented JSON, and a separator line.
4. Unit tests covering the dump-on / dump-off branches.
Out of scope (YAGNI):
- Log rotation / retention — rely on `logrotate` or manual truncation.
- CLI subcommand for a one-shot dump — can add later if tailing proves awkward.
- Structured formats (e.g. NDJSON, SQLite). Plain text with separator lines is sufficient for `grep` / `less` / diffing.
## Configuration
Add to `agent.toml`:
```toml
[debug]
dump_dir = "/var/log/proxmox-monitor"
```
- When `dump_dir` is absent or empty, dumping is off. This is the default.
- When `dump_dir` is set, the agent creates the directory at startup (ignoring "already exists") and writes `commands.log` and `samples.log` inside it.
- If directory creation fails, the agent logs a single warning and runs as if dumping is off. It does not crash.
- Permissions — the directory is created with mode `0755`. Files are created with mode `0644` (default). The agent runs as root per the existing deployment, so no additional permission handling is needed.
## Config struct changes
`ProxmoxAgent.Config` gains one field:
```elixir
defstruct [
:server_url,
:token,
:host_id,
:dump_dir, # new — String.t() | nil
fast_seconds: 30,
medium_seconds: 300,
slow_seconds: 1800
]
```
`build/1` reads `Map.get(map, "debug", %{}) |> Map.get("dump_dir")`, treating empty strings as nil.
## Runtime wiring
`ProxmoxAgent.Application.start/2` loads the config, then calls `ProxmoxAgent.Diagnostics.configure/1` with the `dump_dir` value. `Diagnostics` is a new thin module (see below) that holds the dump state in `Application.put_env/3` (not `:persistent_term` — persistent_term is GC-expensive and overkill for a single nilable string). The Shell and Reporter both read it via `Application.get_env(:agent, :dump_dir)`.
Rationale: passing `dump_dir` through every collector and shell call via opts pollutes signatures for a cross-cutting concern. A single application-env read is simple and the contract ("set once at startup, read everywhere") is clear.
## New module: `ProxmoxAgent.Diagnostics`
One module, one responsibility: decide whether dumping is on and append entries to the right file.
```elixir
defmodule ProxmoxAgent.Diagnostics do
@moduledoc """
Optional diagnostic dump for verifying agent source data.
Disabled when `:dump_dir` is nil. Entries append to commands.log or samples.log.
"""
@spec configure(String.t() | nil) :: :ok
def configure(nil), do: Application.delete_env(:agent, :dump_dir)
def configure(""), do: Application.delete_env(:agent, :dump_dir)
def configure(dir) when is_binary(dir) do
case File.mkdir_p(dir) do
:ok ->
Application.put_env(:agent, :dump_dir, dir)
:ok
{:error, reason} ->
Logger.warning("diagnostics: mkdir #{dir} failed (#{inspect(reason)}); dumping disabled")
Application.delete_env(:agent, :dump_dir)
:ok
end
end
@spec enabled?() :: boolean()
def enabled?, do: not is_nil(Application.get_env(:agent, :dump_dir))
@spec log_command(String.t(), [String.t()], result, non_neg_integer()) :: :ok
def log_command(cmd, args, result, duration_us) do
case Application.get_env(:agent, :dump_dir) do
nil -> :ok
dir -> append(Path.join(dir, "commands.log"), format_command(cmd, args, result, duration_us))
end
end
@spec log_sample(String.t(), map()) :: :ok
def log_sample(kind, payload) do
case Application.get_env(:agent, :dump_dir) do
nil -> :ok
dir -> append(Path.join(dir, "samples.log"), format_sample(kind, payload))
end
end
# private: append/2, format_command/4, format_sample/2
end
```
### Entry formats
`commands.log`:
```
===== 2026-04-22T15:32:17.104Z =====
$ zpool status -j --json-flat-vdevs --json-int
exit=0 duration=124ms size=12847
<full raw output body here, verbatim>
```
Error form:
```
===== 2026-04-22T15:32:19.312Z =====
$ zpool status -j --json-flat-vdevs --json-int
error={:enoent, "zpool"} duration=2ms
```
`samples.log`:
```
===== 2026-04-22T15:32:17.220Z kind=fast =====
{
"host": { ... },
"zfs_pools": { "pools": [ ... ] },
...
}
```
Separators (`===== ...`) are grep-friendly. Timestamps use `DateTime.utc_now() |> DateTime.to_iso8601()`. The payload is Jason.encode!(payload, pretty: true).
### Append safety
Multiple Shell calls can happen concurrently across collectors. `File.write!/3` with `[:append]` mode is atomic enough for line-oriented appends on ext4/xfs/btrfs on Linux up to PIPE_BUF (4 KB), but the full output bodies we write are larger than that. To avoid interleaved bytes, serialize writes through a single GenServer process:
`ProxmoxAgent.Diagnostics.Writer` — a GenServer started by the supervision tree when `dump_dir` is set. It owns both file handles and handles two casts: `{:command, formatted}` and `{:sample, formatted}`. Each cast does a single `IO.write/2` and returns. File handles open once at init; on `:file.write` error, log warning and continue (don't crash the agent).
`Diagnostics.log_command/4` and `Diagnostics.log_sample/2` cast to the writer if dumping is enabled; otherwise they are no-ops. The module encapsulates the enabled/disabled branching so Shell and Reporter don't need to know.
## Shell changes
Wrap the existing `System.cmd` call with timing and a post-call dump:
```elixir
def run(command, args) do
start = System.monotonic_time(:microsecond)
result = do_run(command, args)
duration = System.monotonic_time(:microsecond) - start
ProxmoxAgent.Diagnostics.log_command(command, args, result, duration)
result
end
```
`do_run/2` is the existing body moved unchanged.
## Reporter changes
One line added in `push_metric/3` before the existing `push/4`:
```elixir
defp push_metric(socket, event, data) do
payload = %{collected_at: DateTime.utc_now() |> DateTime.to_iso8601(), data: data}
kind = String.replace_prefix(event, "metric:", "")
ProxmoxAgent.Diagnostics.log_sample(kind, payload)
case push(socket, socket.assigns.topic, event, payload) do
...
end
end
```
## Supervision
`ProxmoxAgent.Application.start/2` starts `Diagnostics.Writer` conditionally. If `cfg.dump_dir` is set and `File.mkdir_p` succeeded, add the writer to the child spec list; otherwise don't start it. The writer's absence is fine because `log_command` / `log_sample` no-op when the application env key is absent.
Concretely: the existing application loads the config, then computes `dump_dir = configure(cfg.dump_dir)` returning either a valid dir or `nil`. If non-nil, add `ProxmoxAgent.Diagnostics.Writer` to the children list, before `ProxmoxAgent.Reporter`.
## Tests
Two targets, both in `agent/test/proxmox_agent/`:
### `diagnostics_test.exs`
- `enabled?/0` returns false when env key unset.
- `configure/1` with `nil` or `""` disables.
- `configure/1` with a valid path creates the directory and sets the env key.
- `log_command/4` with dumping enabled writes a well-formed entry containing the command, args, exit code, and raw body.
- `log_sample/2` with dumping enabled writes a well-formed JSON entry with the kind and payload.
- Both loggers are no-ops when disabled (no file created, no exception).
Uses `System.tmp_dir!() <> "/diag-<random>"` for the dump dir; cleans up on exit.
### `shell_test.exs` (extension)
One new test confirming that when dumping is on, `Shell.run/2` appends to `commands.log`. Uses a config where the shell command is `echo hello` so the test is deterministic and doesn't require `zpool` on the test host.
### `reporter` integration
Skipped — `Reporter` is a Slipstream client whose integration is awkward to mock. The `Diagnostics.log_sample/2` call is a one-line invocation with no branching; covering it via `diagnostics_test.exs` is sufficient.
## Risks
- **File size growth** — accepted. Verification is a bounded activity. A fast-path zpool status of ~12 KB every 30 s is ~35 MB/day. Bigger hosts more. User turns off when done.
- **Writer backlog** — if the writer GenServer falls behind (shouldn't, given small message rates — fast is ~30 s, medium ~5 min, slow ~30 min, and Shell calls are serialized by the collectors anyway), casts queue in the mailbox. Not a production concern; debug feature.
- **Concurrent shell errors during writer start** — if `Shell.run/2` fires before `Diagnostics.Writer` is up, `log_command` no-ops because `Application.get_env` returns nil until `configure/1` runs. The supervision order — Diagnostics.Writer before Reporter — ensures the dump path is primed before any collectors fire.
## Rollout
Additive. Default off. Operator enables by editing `agent.toml` and restarting the service. No migration, no impact on server or dashboard.