21 行代码自部署 Qwen3-VL-8B-Instruct-FP8 成功
准备:
modal secret create qwen-auth QWEN_API_KEY=sk-123abc
modal secret create huggingface-secret HF_TOKEN=hf_<your-token>
import modal
import os

# Hugging Face repo to serve, and the path inside the Modal Volume where
# the weights are cached so they survive container restarts.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct-FP8"
MODEL_DIR = "/data/model"

# Persistent storage shared between the download function and the server.
vol = modal.Volume.from_name("qwen-storage", create_if_missing=True)

# Container image: vLLM plus the image/video decoding runtime deps.
image = (
    modal.Image.debian_slim()
    .apt_install("ffmpeg", "libsm6", "libxext6")  # decord/OpenCV runtime libs
    .run_commands("pip install -U pip")
    .pip_install("vllm>=0.7.0")
    .pip_install("huggingface_hub", "hf_transfer", "decord", "torch-c-dlpack-ext")
    .env({
        "HF_HUB_ENABLE_HF_TRANSFER": "1",  # fast parallel HF downloads
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    })
)

app = modal.App("qwen-vl-volume")
@app.function(
    image=image,  # the function imports huggingface_hub, which only this image installs
    volumes={"/data": vol},
    timeout=1800,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model_to_volume():
    """Download the model weights into the shared Volume (one-time step).

    Skips ``*.pt`` / ``*.bin`` files — the FP8 repo ships safetensors —
    then commits the Volume so serving containers can see the files.
    """
    from huggingface_hub import snapshot_download

    snapshot_download(
        MODEL_ID,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # safetensors only
    )
    vol.commit()  # persist writes to the Volume for other containers
@app.cls(
    gpu="l4",
    image=image,
    volumes={"/data": vol},
    scaledown_window=180,  # idle seconds before the container is reclaimed
    timeout=600,
    secrets=[modal.Secret.from_name("qwen-auth")],
)
class QwenServer:
    @modal.web_server(port=8000, startup_timeout=600)
    def serve(self):
        """Spawn a vLLM OpenAI-compatible API server on port 8000.

        ``@modal.web_server`` expects this method to return after starting
        the server process; Modal then probes the port until it is up.
        """
        import subprocess
        import sys

        # Fail fast if the weights were never downloaded — silently
        # returning would leave nothing listening on :8000 and the
        # startup probe would just time out with no useful error.
        if not os.path.exists(MODEL_DIR):
            raise RuntimeError(
                f"{MODEL_DIR} not found; run download_model_to_volume first"
            )

        api_key = os.environ.get("QWEN_API_KEY", "sk-default")
        cmd = [
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_DIR,
            "--served-model-name", MODEL_ID,  # clients address it by repo id
            "--trust-remote-code",
            "--tensor-parallel-size", "1",
            "--api-key", api_key,
            "--gpu-memory-utilization", "0.90",
            "--max-model-len", "8192",
            "--kv-cache-dtype", "auto",
            "--limit-mm-per-prompt", '{"image": 16, "video": 4}',
            "--port", "8000",
        ]
        # Run vLLM in the background; its logs are forwarded to the
        # container's stdout/stderr so they show up in Modal logs.
        subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
效果:
原图:
29.0 tok/s


