embedding_model/serve/qwen/start_vllm.sh
2025-12-28 09:07:48 +00:00

20 lines
423 B
Bash

# Start the embedding server for the locally stored Qwen3-Embedding-0.6B model.
# NOTE(review): this file is named start_vllm.sh, but the command it launches
# is text-embeddings-router, not vLLM - consider renaming to avoid confusion.

# Make only GPU index 0 visible to CUDA in the launched process.
export CUDA_VISIBLE_DEVICES=0
# NOTE(review): presumably a leftover from a Python-based server; confirm
# whether text-embeddings-router needs PYTHONPATH at all.
export PYTHONPATH="/app"
# Disabled: vLLM-specific logging configuration.
# export VLLM_LOGGING_CONFIG_PATH=/app/logging_config.json
# Time zone inherited by the server process.
export TZ="Asia/Tehran"
# Disabled: log directory creation and a startup delay.
# mkdir -p /app/logs
# sleep 200

# `exec` replaces this shell with the server process instead of running it as
# a child. Stop signals sent to the script's PID (e.g. by a container runtime
# or process supervisor) therefore reach the server directly, and the server's
# exit status becomes the exit status of the script.
#
# The server listens on port 8080, loads the model in float16, and caps
# client batch size, concurrent requests and batch requests at 1024 each,
# with a 32768-token budget per batch (see `text-embeddings-router --help`
# for the exact semantics of each limit).
exec text-embeddings-router \
  --model-id /app/data/models/Qwen3-Embedding-0.6B/model \
  --port 8080 \
  --dtype float16 \
  --max-client-batch-size 1024 \
  --max-concurrent-requests 1024 \
  --max-batch-requests 1024 \
  --max-batch-tokens 32768