# nixpkgs/nixos/tests/web-servers/llama-swap.nix

{ pkgs, lib, ... }:
let
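  # wrap a fetched file in a small derivation so that pname/version and meta
  # (including the unfree license marker below) are attached to the model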
  wrapSrc = attrs: pkgs.runCommand "${attrs.pname}-${attrs.version}" attrs "ln -s $src $out";

  smollm2-135m = wrapSrc rec {
    pname = "smollm2-135m";
    version = "9e6855bc4be717fca1ef21360a1db4b29d5c559a";
    src = pkgs.fetchurl {
      url = "https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF/resolve/${version}/SmolLM2-135M-Instruct-Q4_K_M.gguf";
      hash = "sha256-7V+jDEh7KC7BVsKQYvEiLlwgh1qUSsmCidvSQulH90c=";
    };
    meta.license = with lib.licenses; [
      asl20 # actual license of the model
      unfree # to force an opt-in - do not remove
    ];
  };
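
  # To run the extended SmolLM2 checks, unfree packages have to be allowed for
  # this evaluation, e.g. config.allowUnfree = true, or just this model via a
  # predicate along these lines (illustrative):
  #   allowUnfreePredicate = p: builtins.elem (lib.getName p) [ "smollm2-135m" ];
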
  # grab allowUnfreePredicate if it exists, otherwise default to deny
  allowUnfreePredicate =
    if builtins.hasAttr "allowUnfreePredicate" pkgs.config then
      pkgs.config.allowUnfreePredicate
    else
      (_: false);

  # check whether we may use smollm2-135m, either via a global allowUnfree or
  # an explicit allow from the predicate
  useSmollm2-135m = pkgs.config.allowUnfree || allowUnfreePredicate smollm2-135m;
in
{
  name = "llama-swap";
  meta.maintainers = with lib.maintainers; [
    jk
    podium868909
  ];

  nodes = {
    machine =
      { pkgs, ... }:
      {
        # running models can be memory intensive, but the default
        # `virtualisation.memorySize` is fine here
        services.llama-swap = {
          enable = true;
          settings =
            # config for the basic tests
            if !useSmollm2-135m then
              { }
            # config for the extended tests using SmolLM2
            else
              let
                llama-cpp = pkgs.llama-cpp;
                llama-server = lib.getExe' llama-cpp "llama-server";
              in
              {
                hooks.on_startup.preload = [
                  "smollm2"
                ];
                # temperature and top-k are important for SmolLM2 performance/accuracy
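                # ttl = 10 lets llama-swap unload the smollm2 model after roughly
                # 10 seconds of idling; the timeout subtest in the test script
                # relies on this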
                models = {
                  "smollm2" = {
                    ttl = 10;
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9";
                  };
                  "smollm2-group-1" = {
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                  "smollm2-group-2" = {
                    proxy = "http://127.0.0.1:5802";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                };
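                # "standalone" swaps its single model in and out on its own, while
                # "group" sets swap = false so both of its members can stay loaded
                # at the same time - the concurrent-run checks below depend on that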
                groups = {
                  "standalone" = {
                    swap = true;
                    exclusive = true;
                    members = [
                      "smollm2"
                    ];
                  };
                  "group" = {
                    swap = false;
                    exclusive = true;
                    members = [
                      "smollm2-group-1"
                      "smollm2-group-2"
                    ];
                  };
                };
              };
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # core tests
      import json

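      # small helpers: shell out to curl on the machine and decode the JSON body;
      # post_json additionally sends a dummy bearer token and a JSON payload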
      def get_json(route):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"'
          ]
          return json.loads(machine.succeed("curl {args} http://localhost:8080{route}".format(args=" ".join(args), route=route)))

      def post_json(route, data):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"',
              '-H "Authorization: Bearer no-key"',
              "-d '{d}'".format(d=json.dumps(data))
          ]
          return json.loads(machine.succeed('curl {args} http://localhost:8080{route}'.format(args=" ".join(args), route=route)))

      machine.wait_for_unit('llama-swap')
      machine.wait_for_open_port(8080)

      with subtest('check is serving ui'):
          machine.succeed('curl --fail http://localhost:8080/ui/')

      with subtest('check is healthy'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health | grep "OK"')
    ''
    + lib.optionalString useSmollm2-135m ''
      # extended tests using SmolLM2
      with subtest('check `/running` for preloaded smollm2'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('runs smollm2'):
          response = None
          with subtest('send request to smollm2'):
              data = {
                  'model': 'smollm2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response = post_json('/v1/chat/completions', data)
          with subtest('response is from smollm2'):
              assert response['model'] == 'smollm2'
          with subtest('response contains at least one item in "choices"'):
              assert len(response['choices']) >= 1
          assistant_choices = None
          with subtest('response contains at least one "assistant" message'):
              assistant_choices = [c for c in response['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices) >= 1
          with subtest('first message (lowercase) starts with "hello"'):
              assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello'

      with subtest('check `/running` for just smollm2'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'
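
      # the smollm2 model is configured with ttl = 10, so once it sits idle it
      # should drop out of /running; allow a little over 10 seconds for that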
      with subtest('check `/running` for smollm2 to timeout'):
          machine.succeed('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep -v "smollm2"', timeout=11)
          running_response = get_json('/running')
          assert len(running_response['running']) == 0

      with subtest('runs smollm2-group-1 and smollm2-group-2'):
          response_1 = None
          with subtest('send request to smollm2-group-1'):
              data = {
                  'model': 'smollm2-group-1',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_1 = post_json('/v1/chat/completions', data)
          with subtest('response 1 is from smollm2-group-1'):
              assert response_1['model'] == 'smollm2-group-1'
          with subtest('response 1 contains at least one item in "choices"'):
              assert len(response_1['choices']) >= 1
          assistant_choices_1 = None
          with subtest('response 1 contains at least one "assistant" message'):
              assistant_choices_1 = [c for c in response_1['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_1) >= 1
          with subtest('first message (lowercase) in response 1 starts with "hello"'):
              assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for just smollm2-group-1'):
              running_response = get_json('/running')
              assert len(running_response['running']) == 1
              running_model = running_response['running'][0]
              assert running_model['model'] == 'smollm2-group-1'
              assert running_model['state'] == 'ready'

          response_2 = None
          with subtest('send request to smollm2-group-2'):
              data = {
                  'model': 'smollm2-group-2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_2 = post_json('/v1/chat/completions', data)
          with subtest('response 2 is from smollm2-group-2'):
              assert response_2['model'] == 'smollm2-group-2'
          with subtest('response 2 contains at least one item in "choices"'):
              assert len(response_2['choices']) >= 1
          assistant_choices_2 = None
          with subtest('response 2 contains at least one "assistant" message'):
              assistant_choices_2 = [c for c in response_2['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_2) >= 1
          with subtest('first message (lowercase) in response 2 starts with "hello"'):
              assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for both smollm2-group-1 and smollm2-group-2'):
              running_response = get_json('/running')['running']
              assert len(running_response) == 2
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-1'
              ]) == 1
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-2'
              ]) == 1
    '';
}