Skip to content

Can't process response from llamacpp server #97

@gergap

Description

@gergap

I have nvim/llm working with ollama, which uses llm-ls-x86_64-unknown-linux-gnu-0.5.3.
I tried to switch the config to use the OpenAI API to connect to the llamacpp server,
because it supports my AMD GPU, which ollama does not.

I can see in Wireshark that the request is sent and the llamacpp server sends back a successful response.
However, I don't get any completion in nvim, which is probably caused by the way llm-ls processes the response.

I get this error in nvim: [LLM] serde json error: data did not match any variant of untagged enum OpenAIAPIResponse

request:

{
  "model": "models/codellama-7b.Q4_K_M.gguf",
  "options": {
    "temperature": 0.2,
    "top_p": 0.95
  },
  "parameters": {
    "max_new_tokens": 60,
    "temperature": 0.2,
    "top_p": 0.95
  },
  "prompt": "<PRE> #include <stdio.h>\n\nfloat multiply(float a, float b)\n{\n     <SUF>\n}\n\nint main(int argc, char *argv[])\n{\n    return 0;\n}\n\n\n <MID>",
  "stream": false
}

response:

{
  "content": "return a * b; <EOT>",
  "id_slot": 0,
  "stop": true,
  "model": "models/codellama-7b.Q4_K_M.gguf",
  "tokens_predicted": 6,
  "tokens_evaluated": 54,
  "generation_settings": {
    "n_ctx": 512,
    "n_predict": -1,
    "model": "models/codellama-7b.Q4_K_M.gguf",
    "seed": 4294967295,
    "temperature": 0.800000011920929,
    "dynatemp_range": 0,
    "dynatemp_exponent": 1,
    "top_k": 40,
    "top_p": 0.949999988079071,
    "min_p": 0.05000000074505806,
    "tfs_z": 1,
    "typical_p": 1,
    "repeat_last_n": 64,
    "repeat_penalty": 1,
    "presence_penalty": 0,
    "frequency_penalty": 0,
    "penalty_prompt_tokens": [],
    "use_penalty_prompt_tokens": false,
    "mirostat": 0,
    "mirostat_tau": 5,
    "mirostat_eta": 0.10000000149011612,
    "penalize_nl": false,
    "stop": [],
    "n_keep": 0,
    "n_discard": 0,
    "ignore_eos": false,
    "stream": false,
    "logit_bias": [],
    "n_probs": 0,
    "min_keep": 0,
    "grammar": "",
    "samplers": [
      "top_k",
      "tfs_z",
      "typical_p",
      "top_p",
      "min_p",
      "temperature"
    ]
  },
  "prompt": "<PRE> #include <stdio.h>\n\nfloat multiply(float a, float b)\n{\n     <SUF>\n}\n\nint main(int argc, char *argv[])\n{\n    return 0;\n}\n\n\n <MID>",
  "truncated": false,
  "stopped_eos": true,
  "stopped_word": false,
  "stopped_limit": false,
  "stopping_word": "",
  "tokens_cached": 59,
  "timings": {
    "prompt_n": 54,
    "prompt_ms": 601.562,
    "prompt_per_token_ms": 11.140037037037038,
    "prompt_per_second": 89.76630837719138,
    "predicted_n": 6,
    "predicted_ms": 315.451,
    "predicted_per_token_ms": 52.57516666666667,
    "predicted_per_second": 19.020386684461293
  }
}

I hope you can fix that or tell me what I did wrong if it's my fault.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions