准备:

modal secret create qwen-auth QWEN_API_KEY=sk-123abc
modal secret create huggingface-secret HF_TOKEN=hf_…
import modal
import os

# Hugging Face repo to serve, and the path inside the shared volume where
# its weights are cached.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct-FP8"
MODEL_DIR = "/data/model"

# Persistent volume shared between the one-off download job and the server.
vol = modal.Volume.from_name("qwen-storage", create_if_missing=True)

# Container image: ffmpeg + X11 libs for video decoding, vLLM for serving,
# hf_transfer for fast parallel weight downloads.
image = (
    modal.Image.debian_slim()
    .apt_install("ffmpeg", "libsm6", "libxext6")
    .run_commands("pip install -U pip")
    .pip_install("vllm>=0.7.0")
    .pip_install("huggingface_hub", "hf_transfer", "decord", "torch-c-dlpack-ext")
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        }
    )
)

app = modal.App("qwen-vl-volume")
@app.function(
    # `image=image` was missing in the original: without it the function runs
    # in a bare container where huggingface_hub / hf_transfer are not
    # installed and the import below fails.
    image=image,
    volumes={"/data": vol},
    timeout=1800,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def download_model_to_volume():
    """Download the model weights from Hugging Face into the shared volume.

    Skips ``*.pt`` / ``*.bin`` files — presumably the repo ships safetensors,
    so the legacy formats are dead weight (verify against the repo contents).
    Run once before starting :class:`QwenServer`.
    """
    from huggingface_hub import snapshot_download

    snapshot_download(
        MODEL_ID,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],
    )
    # Persist the downloaded files so other containers mounting the volume
    # (the vLLM server) can see them.
    vol.commit()
@app.cls(
    gpu="l4",
    image=image,
    volumes={"/data": vol},
    scaledown_window=180,
    timeout=600,
    secrets=[modal.Secret.from_name("qwen-auth")],
)
class QwenServer:
    """Serves the Qwen3-VL model through vLLM's OpenAI-compatible HTTP API."""

    @modal.web_server(port=8000, startup_timeout=600)
    def serve(self):
        """Launch the vLLM OpenAI API server as a background process on :8000."""
        import subprocess
        import sys

        # Pick up files committed by download_model_to_volume() after this
        # container's snapshot of the volume was taken.
        vol.reload()
        if not os.path.exists(MODEL_DIR):
            # Fail loudly: the original silently returned, which leaves the
            # web_server never binding port 8000 and dying only at the
            # startup timeout with no explanation.
            raise RuntimeError(
                f"Model weights not found at {MODEL_DIR}; "
                "run download_model_to_volume() first."
            )

        api_key = os.environ.get("QWEN_API_KEY", "sk-default")
        cmd = [
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_DIR,
            "--served-model-name", MODEL_ID,
            "--trust-remote-code",
            "--tensor-parallel-size", "1",
            "--api-key", api_key,
            "--gpu-memory-utilization", "0.90",
            "--max-model-len", "8192",
            "--kv-cache-dtype", "auto",
            "--limit-mm-per-prompt", '{"image": 16, "video": 4}',
            "--port", "8000",
        ]
        # web_server expects the process to keep running in the background;
        # Popen (not run) so this method returns while vLLM boots.
        subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)


效果:

原图:


29.0 tok/s


📌 转载信息
原作者:
Clancy
转载时间:
2026/1/6 12:02:52

标签: 模型部署, 多模态模型, vLLM, Qwen3-VL-8B-Instruct-FP8, Modal

添加新评论