# Download the Q8_0 GGUF file of Qwen3.5-9B from the unsloth/Qwen3.5-9B-GGUF repo
# into the local llama.cpp models folder.
hf download unsloth/Qwen3.5-9B-GGUF Qwen3.5-9B-Q8_0.gguf --local-dir C:\llama.cpp\models
Model: ~9.0 GB
32k total context (ISL + OSL, i.e. input plus output tokens): ~1.0 GB
# Start llama-server with its OpenAI-compatible HTTP API on port 8000.

# Expose only device index 0 to the Vulkan backend.
$env:GGML_VK_VISIBLE_DEVICES = "0"

# Arguments are kept in an array instead of backtick line continuations:
# a backtick followed by stray trailing whitespace silently ends the command,
# and continued lines cannot carry comments.
$serverArgs = @(
    '-m', 'C:\llama.cpp\models\Qwen3.5-9B-Q8_0.gguf'   # model file to load
    '--alias', 'qwen-local'          # model name exposed through the API
    '--jinja'                        # enable Jinja chat-template processing (used for tool calling)
    '--port', '8000'
    '--host', '0.0.0.0'              # listens on all interfaces; use 127.0.0.1 for local-only access
    '-c', '32768'                    # context size in tokens (32k)
    '-ngl', '99'                     # offload up to 99 layers to the GPU
    '--flash-attn', 'true'
    '--api-key', 'local-anything'    # clients send this value as a Bearer token
)

& 'C:\llama.cpp\llama-server.exe' @serverArgs

# Smoke test for tool calling: send one OpenAI-style chat completion request
# that offers a single tool (get_weather) and print the assistant message.

# JSON Schema for the tool's arguments: an object with one required string, "city".
$weatherParams = @{
    type       = "object"
    properties = @{ city = @{ type = "string" } }
    required   = @("city")
}

$weatherTool = @{
    type     = "function"
    function = @{
        name        = "get_weather"
        description = "Get current weather for a city"
        parameters  = $weatherParams
    }
}

# "qwen-local" is the same name given to llama-server via --alias.
# -Depth 10: ConvertTo-Json only serializes 2 levels by default, which would
# collapse the nested tool schema into type-name strings.
$body = @{
    model    = "qwen-local"
    messages = @(@{ role = "user"; content = "What's the weather in Tokyo?" })
    tools    = @($weatherTool)
} | ConvertTo-Json -Depth 10

# Bearer token is the value passed to llama-server via --api-key.
$headers = @{ "Authorization" = "Bearer local-anything" }

# The URI must be a bare URL; wrapping it in <...> (a markdown auto-link
# artifact) makes Invoke-RestMethod reject it as an invalid URI.
$request = @{
    Uri         = "http://localhost:8000/v1/chat/completions"
    Method      = "POST"
    ContentType = "application/json"
    Headers     = $headers
    Body        = $body
}
$r = Invoke-RestMethod @request

# Show the first choice's message (content, reasoning_content, tool_calls) as JSON.
$r.choices[0].message | ConvertTo-Json -Depth 10
PS C:\llama.cpp> $headers = @{ "Authorization" = "Bearer local-anything" }
PS C:\llama.cpp>
PS C:\llama.cpp> $r = Invoke-RestMethod -Uri "http://localhost:8000/v1/chat/completions" `
>> -Method POST -ContentType "application/json" `
>> -Headers $headers -Body $body
PS C:\llama.cpp>
PS C:\llama.cpp> $r.choices[0].message | ConvertTo-Json -Depth 10
{
"role": "assistant",
"content": "",
"reasoning_content": "The user is asking about the weather in Tokyo. I have access to a get_weather function that can get current weather for a city. I need to call this function with \"Tokyo\" as the city parameter.\n",
"tool_calls": [
{
"type": "function",
"function": {
"name": "get_weather",
"arguments": "{\"city\":\"Tokyo\"}"
},
"id": "80o9sz5lnRSjS2wyz5XYVSuIQNC5ERJQ"
}
]
}
That's a perfect tool call above. The model:
- kept its reasoning in reasoning_content (rather than leaking into content)
- returned an empty content and a structured tool_calls array — exactly what an OpenAI/Anthropic client expects