# nixpkgs/nixos/tests/web-servers/llama-swap.nix

{ pkgs, lib, ... }:
let
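  # wrap a fetched file in a small derivation so that pname/version and meta
  # (including the unfree license marker below) are attached to the model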
  wrapSrc = attrs: pkgs.runCommand "${attrs.pname}-${attrs.version}" attrs "ln -s $src $out";

  smollm2-135m = wrapSrc rec {
    pname = "smollm2-135m";
    version = "9e6855bc4be717fca1ef21360a1db4b29d5c559a";
    src = pkgs.fetchurl {
      url = "https://huggingface.co/unsloth/SmolLM2-135M-Instruct-GGUF/resolve/${version}/SmolLM2-135M-Instruct-Q4_K_M.gguf";
      hash = "sha256-7V+jDEh7KC7BVsKQYvEiLlwgh1qUSsmCidvSQulH90c=";
    };
    meta.license = with lib.licenses; [
      asl20 # actual license of the model
      unfree # to force an opt-in - do not remove
    ];
  };
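
  # To run the extended SmolLM2 checks, unfree packages have to be allowed for
  # this evaluation, e.g. config.allowUnfree = true, or just this model via a
  # predicate along these lines (illustrative):
  #   allowUnfreePredicate = p: builtins.elem (lib.getName p) [ "smollm2-135m" ];
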
  # grab allowUnfreePredicate if it exists, otherwise default to deny
  allowUnfreePredicate =
    if builtins.hasAttr "allowUnfreePredicate" pkgs.config then
      pkgs.config.allowUnfreePredicate
    else
      (_: false);

  # check whether we may use smollm2-135m, either via a global allowUnfree or
  # an explicit allow from the predicate
  useSmollm2-135m = pkgs.config.allowUnfree || allowUnfreePredicate smollm2-135m;
in
{
  name = "llama-swap";
  meta.maintainers = with lib.maintainers; [
    jk
    podium868909
  ];

  nodes = {
    machine =
      { pkgs, ... }:
      {
        # running models can be memory intensive, but the default
        # `virtualisation.memorySize` is fine here
        services.llama-swap = {
          enable = true;
          settings =
            # config for the basic tests
            if !useSmollm2-135m then
              { }
            # config for the extended tests using SmolLM2
            else
              let
                llama-cpp = pkgs.llama-cpp;
                llama-server = lib.getExe' llama-cpp "llama-server";
              in
              {
                hooks.on_startup.preload = [
                  "smollm2"
                ];
                # temperature and top-k are important for SmolLM2 performance/accuracy
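                # ttl = 10 lets llama-swap unload the smollm2 model after roughly
                # 10 seconds of idling; the timeout subtest in the test script
                # relies on this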
                models = {
                  "smollm2" = {
                    ttl = 10;
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9";
                  };
                  "smollm2-group-1" = {
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                  "smollm2-group-2" = {
                    proxy = "http://127.0.0.1:5802";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                };
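                # "standalone" swaps its single model in and out on its own, while
                # "group" sets swap = false so both of its members can stay loaded
                # at the same time - the concurrent-run checks below depend on that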
                groups = {
                  "standalone" = {
                    swap = true;
                    exclusive = true;
                    members = [
                      "smollm2"
                    ];
                  };
                  "group" = {
                    swap = false;
                    exclusive = true;
                    members = [
                      "smollm2-group-1"
                      "smollm2-group-2"
                    ];
                  };
                };
              };
        };
      };
  };

  testScript =
    { nodes, ... }:
    ''
      # core tests
      import json

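      # small helpers: shell out to curl on the machine and decode the JSON body;
      # post_json additionally sends a dummy bearer token and a JSON payload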
      def get_json(route):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"'
          ]
          return json.loads(machine.succeed("curl {args} http://localhost:8080{route}".format(args=" ".join(args), route=route)))

      def post_json(route, data):
          args = [
              '-v',
              '-s',
              '--fail',
              '-H "Content-Type: application/json"',
              '-H "Authorization: Bearer no-key"',
              "-d '{d}'".format(d=json.dumps(data))
          ]
          return json.loads(machine.succeed('curl {args} http://localhost:8080{route}'.format(args=" ".join(args), route=route)))

      machine.wait_for_unit('llama-swap')
      machine.wait_for_open_port(8080)

      with subtest('check is serving ui'):
          machine.succeed('curl --fail http://localhost:8080/ui/')

      with subtest('check is healthy'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health | grep "OK"')
    ''
    + lib.optionalString useSmollm2-135m ''
      # extended tests using SmolLM2
      with subtest('check `/running` for preloaded smollm2'):
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'

      with subtest('runs smollm2'):
          response = None
          with subtest('send request to smollm2'):
              data = {
                  'model': 'smollm2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response = post_json('/v1/chat/completions', data)
          with subtest('response is from smollm2'):
              assert response['model'] == 'smollm2'
          with subtest('response contains at least one item in "choices"'):
              assert len(response['choices']) >= 1
          assistant_choices = None
          with subtest('response contains at least one "assistant" message'):
              assistant_choices = [c for c in response['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices) >= 1
          with subtest('first message (lowercase) starts with "hello"'):
              assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello'

      with subtest('check `/running` for just smollm2'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'
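
      # the smollm2 model is configured with ttl = 10, so once it sits idle it
      # should drop out of /running; allow a little over 10 seconds for that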
      with subtest('check `/running` for smollm2 to timeout'):
          machine.succeed('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
          machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep -v "smollm2"', timeout=11)
          running_response = get_json('/running')
          assert len(running_response['running']) == 0

      with subtest('runs smollm2-group-1 and smollm2-group-2'):
          response_1 = None
          with subtest('send request to smollm2-group-1'):
              data = {
                  'model': 'smollm2-group-1',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_1 = post_json('/v1/chat/completions', data)
          with subtest('response 1 is from smollm2-group-1'):
              assert response_1['model'] == 'smollm2-group-1'
          with subtest('response 1 contains at least one item in "choices"'):
              assert len(response_1['choices']) >= 1
          assistant_choices_1 = None
          with subtest('response 1 contains at least one "assistant" message'):
              assistant_choices_1 = [c for c in response_1['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_1) >= 1
          with subtest('first message (lowercase) in response 1 starts with "hello"'):
              assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for just smollm2-group-1'):
              running_response = get_json('/running')
              assert len(running_response['running']) == 1
              running_model = running_response['running'][0]
              assert running_model['model'] == 'smollm2-group-1'
              assert running_model['state'] == 'ready'

          response_2 = None
          with subtest('send request to smollm2-group-2'):
              data = {
                  'model': 'smollm2-group-2',
                  'messages': [
                      {
                          'role': 'user',
                          'content': 'Say hello'
                      }
                  ]
              }
              response_2 = post_json('/v1/chat/completions', data)
          with subtest('response 2 is from smollm2-group-2'):
              assert response_2['model'] == 'smollm2-group-2'
          with subtest('response 2 contains at least one item in "choices"'):
              assert len(response_2['choices']) >= 1
          assistant_choices_2 = None
          with subtest('response 2 contains at least one "assistant" message'):
              assistant_choices_2 = [c for c in response_2['choices'] if c['message']['role'] == 'assistant']
              assert len(assistant_choices_2) >= 1
          with subtest('first message (lowercase) in response 2 starts with "hello"'):
              assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello'

          with subtest('check `/running` for both smollm2-group-1 and smollm2-group-2'):
              running_response = get_json('/running')['running']
              assert len(running_response) == 2
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-1'
              ]) == 1
              assert len([
                  rm for rm in running_response
                  if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-2'
              ]) == 1
    '';
}