Initial commit: SpamLLM - DistilBERT spam classifier for RSpamd
Multilingual spam classifier (DE/EN) with language detection. Non-DE/EN mails receive an additional spam score bonus. - train.py: Fine-tune distilbert-base-multilingual-cased on spam/ham data - server.py: FastAPI service with langdetect integration - rspamd/: Lua plugin and config for RSpamd integration - export_rspamd_data.py: Export Maildir folders to CSV training data - test_classify.py: Local model validation with DE/EN/foreign test cases Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
commit
38efd20b4d
7 changed files with 671 additions and 0 deletions
129
rspamd/lua/spamllm.lua
Normal file
129
rspamd/lua/spamllm.lua
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
-- RSpamd Lua Plugin für SpamLLM
|
||||
-- Kopiere nach /etc/rspamd/plugins.d/spamllm.lua
|
||||
--
|
||||
-- Dieses Plugin sendet Mail-Daten an den SpamLLM HTTP Service
|
||||
-- und setzt den Score basierend auf der Antwort.
|
||||
|
||||
local rspamd_http = require "rspamd_http"
|
||||
local rspamd_logger = require "rspamd_logger"
|
||||
local ucl = require "ucl"
|
||||
|
||||
-- Module name; also the name of the configuration section that is read below.
local N = "spamllm"

-- Built-in defaults. Any key may be overridden from the `spamllm { ... }`
-- section of the Rspamd configuration (see the overlay below).
local settings = {
  -- Endpoint of the SpamLLM HTTP service that receives the classify request.
  url = "http://127.0.0.1:8000/classify",
  -- Timeout passed to rspamd_http.request.
  timeout = 5.0,
  -- Symbol inserted when the service reports the message as spam.
  symbol_spam = "SPAMLLM_SPAM",
  -- Symbol inserted when the service reports the message as ham.
  symbol_ham = "SPAMLLM_HAM",
  -- Symbol inserted when the service reports a foreign-language bonus.
  symbol_foreign = "SPAMLLM_FOREIGN_LANG",
  -- NOTE(review): not referenced by the visible code; the verdict is taken
  -- from the service's `is_spam` field. Confirm whether this is still needed.
  threshold = 0.5,
  -- Maximum number of body bytes sent to the service.
  max_body_length = 4096,
  -- Master switch for the plugin.
  enabled = true,
}

-- Overlay user configuration on top of the defaults. Previously the defaults
-- were hard-coded and the configuration section was never consulted, so the
-- URL, timeout etc. could not be changed without editing this file.
do
  local opts = rspamd_config:get_all_opt(N)
  if type(opts) == "table" then
    for key, value in pairs(opts) do
      settings[key] = value
    end
  end
end
|
||||
|
||||
--- Drop an incomplete UTF-8 sequence from the end of `s`.
-- Byte-wise truncation can cut a multi-byte character in half; sending such
-- a fragment would put invalid UTF-8 into the JSON request.
-- @param s string to inspect
-- @return `s` itself, or `s` without its trailing partial character
local function spamllm_trim_partial_utf8(s)
  local len = #s
  local i = len
  -- Walk back over at most three continuation bytes (10xxxxxx).
  while i > 0 and len - i < 3 do
    local b = s:byte(i)
    if b < 0x80 or b >= 0xC0 then
      break
    end
    i = i - 1
  end
  if i == 0 then
    return s
  end

  local lead = s:byte(i)
  local need
  if lead < 0x80 then
    need = 1
  elseif lead < 0xC0 then
    -- Still a continuation byte: malformed input, leave it untouched.
    return s
  elseif lead < 0xE0 then
    need = 2
  elseif lead < 0xF0 then
    need = 3
  else
    need = 4
  end

  if len - i + 1 < need then
    return s:sub(1, i - 1)
  end
  return s
end

--- Concatenate the message's text parts, limited to `limit` bytes.
-- @param task rspamd task
-- @param limit maximum number of bytes to return
-- @return string, possibly empty
local function spamllm_collect_body(task, limit)
  local parts = task:get_text_parts()
  if not parts then
    return ""
  end

  -- Collect pieces and join once instead of growing a string in the loop.
  local chunks, total = {}, 0
  for _, part in ipairs(parts) do
    local content = part:get_content()
    if content then
      local piece = tostring(content)
      chunks[#chunks + 1] = piece
      total = total + #piece
      if total > limit then
        break
      end
    end
  end

  local body = table.concat(chunks)
  if #body > limit then
    body = spamllm_trim_partial_utf8(body:sub(1, limit))
  end
  return body
end

--- Symbol callback: send sender, subject and body text to the SpamLLM
-- service and insert result symbols according to its JSON answer.
--
-- Fields read from the response: `is_spam`, `confidence`, `score`,
-- `language`, `foreign_lang_bonus`.
-- @param task rspamd task being scanned
local function check_spamllm(task)
  -- Only an explicit `false` disables the check.
  if settings.enabled == false then
    return
  end

  -- Envelope sender (first SMTP "from" address, if any).
  local from = task:get_from("smtp")
  local from_addr = ""
  if from and from[1] then
    from_addr = from[1].addr or ""
  end

  local subject = task:get_subject() or ""
  local body = spamllm_collect_body(task, settings.max_body_length)

  -- Let the UCL emitter do the JSON encoding. The previous hand-built string
  -- escaped only double quotes (plus CR/LF in the body), so a backslash, a
  -- tab or a newline in the subject produced invalid JSON.
  local request_body = ucl.to_format({
    from_addr = from_addr,
    subject = subject,
    body = body,
  }, "json-compact")

  local function callback(err, code, response_body)
    if err then
      rspamd_logger.errx(task, "SpamLLM request failed: %s", err)
      return
    end

    if code ~= 200 then
      rspamd_logger.errx(task, "SpamLLM returned HTTP %s", code)
      return
    end

    local parser = ucl.parser()
    local ok, parse_err = parser:parse_string(response_body)
    if not ok then
      rspamd_logger.errx(task, "SpamLLM JSON parse error: %s", parse_err)
      return
    end

    local result = parser:get_object()
    if type(result) ~= "table" then
      rspamd_logger.errx(task, "SpamLLM returned an unexpected payload")
      return
    end

    -- Without a numeric confidence the unary minus below would raise.
    local confidence = tonumber(result.confidence)
    if not confidence then
      rspamd_logger.errx(task, "SpamLLM response lacks a numeric confidence")
      return
    end

    if result.is_spam then
      task:insert_result(settings.symbol_spam, confidence, "SpamLLM")
      -- Pre-format with string.format: the rspamd logger is documented with
      -- %s / %N placeholders only, not printf-style %.2f.
      rspamd_logger.infox(task, "%s", string.format(
        "SpamLLM: SPAM (confidence=%.2f, score=%.2f, lang=%s)",
        confidence, tonumber(result.score) or 0,
        tostring(result.language or "?")))
    else
      -- NOTE(review): the weight is negated here. If the score configured
      -- for this symbol is negative as well, the product becomes positive;
      -- confirm against the symbol scores configuration.
      task:insert_result(settings.symbol_ham, -confidence, "SpamLLM")
    end

    -- Foreign-language bonus is reported as a separate symbol.
    local bonus = tonumber(result.foreign_lang_bonus)
    if bonus and bonus > 0 then
      -- The bonus is divided by 4.0, presumably to normalise it against the
      -- weight of 4.0 this symbol is registered with -- TODO confirm.
      task:insert_result(settings.symbol_foreign, bonus / 4.0,
        string.format("lang=%s", tostring(result.language or "unknown")))
      rspamd_logger.infox(task, "%s", string.format(
        "SpamLLM: Foreign language detected: %s (bonus=%.1f)",
        tostring(result.language or "unknown"), bonus))
    end
  end

  local scheduled = rspamd_http.request({
    task = task,
    url = settings.url,
    body = request_body,
    callback = callback,
    -- Declare the payload type via the dedicated option as well as the
    -- header, so the service sees application/json.
    mime_type = "application/json",
    headers = {
      ["Content-Type"] = "application/json",
    },
    timeout = settings.timeout,
  })

  -- A false return means the request was never scheduled and the callback
  -- will not run; make that visible in the log.
  if not scheduled then
    rspamd_logger.errx(task, "SpamLLM request could not be scheduled")
  end
end
|
||||
|
||||
-- Register the symbols.
-- The spam symbol owns the callback; the other two are virtual symbols
-- attached to it and are only inserted from within that callback.
rspamd_config:register_symbol({
  type = "normal",
  name = settings.symbol_spam,
  callback = check_spamllm,
  weight = 5.0,
  description = "SpamLLM DistilBERT spam classifier",
})

do
  -- Virtual children of the spam symbol, registered in this order.
  local virtual_symbols = {
    {
      name = settings.symbol_ham,
      weight = -2.0,
      description = "SpamLLM DistilBERT ham classification",
    },
    {
      name = settings.symbol_foreign,
      weight = 4.0,
      description = "Mail in unerwarteter Sprache (nicht DE/EN)",
    },
  }

  for _, spec in ipairs(virtual_symbols) do
    rspamd_config:register_symbol({
      type = "virtual",
      name = spec.name,
      weight = spec.weight,
      -- Look up the id of the already registered callback symbol.
      parent = rspamd_config:get_symbol_id(settings.symbol_spam),
      description = spec.description,
    })
  end
end
|
||||
Loading…
Add table
Add a link
Reference in a new issue