@zsj1029
```yaml
# Docker Compose definition for a single vLLM OpenAI-compatible server that
# serves the model mounted at /model under the name "qwen3.6-27b".
services:
  vllm-qwen36-27b:
    image: vllm/vllm-openai:v0.20.0
    container_name: vllm-qwen36-27b
    restart: "unless-stopped"
    ports:
      # Host port 8085 -> container port 8000 (matches --port below).
      - "8085:8000"
    volumes:
      # Relative bind-mount sources must start with "./"; without it Compose
      # treats the source as a named volume and rejects the slashes.
      # NOTE(review): adjust to an absolute path if the model lives elsewhere.
      - ./models/huggingface/qwen3.6-27b-autoround-int4:/model
    environment:
      # NOTE(review): only device 0 is made visible here, while the device
      # reservation below requests all GPUs - confirm this is intended.
      - CUDA_VISIBLE_DEVICES=0
      - VLLM_WORKER_MULTIPROC_METHOD=spawn
      # Variable names are case-sensitive: the previous spelling
      # "PYTORCH_CUDA_ALLOC_COnF" was silently ignored.
      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    shm_size: "16gb"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Arguments handed to the image's entrypoint (presumably the vLLM
    # OpenAI API server - verify against the image documentation).
    command:
      # Model location and the name exposed to API clients.
      - --model
      - /model
      - --served-model-name
      - qwen3.6-27b
      # Weight format and compute dtype.
      - --quantization
      - auto-round
      - --dtype
      - float16
      # Memory / context budget; a single sequence at a time.
      - --gpu-memory-utilization
      - "0.92"
      - --max-model-len
      - "160000"
      - --kv-cache-dtype
      - fp8_e5m2
      - --max-num-seqs
      - "1"
      - --trust-remote-code
      # Reasoning and tool-call output parsing.
      - --reasoning-parser
      - qwen3
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_xml
      - --enable-prefix-caching
      # JSON value is single-quoted so the inner double quotes stay literal.
      - --speculative-config
      - '{"method":"mtp","num_speculative_tokens":2}'
      # Listen on all interfaces inside the container.
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
```