permission denied while trying to connect to the docker API at unix:///var/run/docker.sock permission denied while trying to connect to the docker API at unix:///var/run/docker.sock permission denied while trying to connect to the docker API at unix:///var/run/docker.sock permission denied while trying to connect to the docker API at unix:///var/run/docker.sock db7adbd16aa74ab46d7e4912516bb1e829c31faeaae466a4bc54c2c79c7df7f4 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: Error response from daemon: No such container: vibevoice-asr 9e1f140c8d965123ecfa9219fdea41f3fbee008c802238a5346efcc24dae62a1 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] return self._call_with_optional_nvtx_range( (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/wrapper.py", line 119, in _call_with_optional_nvtx_range (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] return callable_fn(*args, **kwargs) (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 845, in compile_wrapper (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 990, in _compile_fx_inner (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] raise InductorError(e, currentframe()).with_traceback( (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 974, in _compile_fx_inner (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] mb_compiled_graph = fx_codegen_and_compile( (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 1695, in fx_codegen_and_compile (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 1505, in codegen_and_compile (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] compiled_module = graph.compile_to_module() (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2319, in compile_to_module (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] return self._compile_to_module() (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2325, in _compile_to_module (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2271, in codegen (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] result = self.wrapper_code.generate(self.is_inference) (EngineCore_DP0 pid=64) Process EngineCore_DP0: (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1552, in generate (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] return self._generate(is_inference) (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1615, in _generate (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] self.generate_and_run_autotune_block() (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1695, in generate_and_run_autotune_block (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] raise RuntimeError(f"Failed to run autotuning code block: {e}") from e (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] torch._inductor.exc.InductorError: RuntimeError: Failed to run autotuning code block: No valid triton configs. PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] `ptxas` stderr: (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmphjhx5dp3.ptx -o /tmp/tmphjhx5dp3.ptx.o (EngineCore_DP0 pid=64) ERROR 04-30 07:58:48 [core.py:936] (EngineCore_DP0 pid=64) Traceback (most recent call last): (EngineCore_DP0 pid=64) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap (EngineCore_DP0 pid=64) self.run() (EngineCore_DP0 pid=64) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run (EngineCore_DP0 pid=64) self._target(*self._args, **self._kwargs) (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 940, in run_engine_core (EngineCore_DP0 pid=64) raise e (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core (EngineCore_DP0 pid=64) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 692, in __init__ (EngineCore_DP0 pid=64) super().__init__( (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 113, in __init__ (EngineCore_DP0 pid=64) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 243, in _initialize_kv_caches (EngineCore_DP0 pid=64) available_gpu_memory = self.model_executor.determine_available_memory() (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 126, in determine_available_memory (EngineCore_DP0 pid=64) return self.collective_rpc("determine_available_memory") (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc (EngineCore_DP0 pid=64) result = run_method(self.driver_worker, method, args, kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method (EngineCore_DP0 pid=64) return func(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=64) return func(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 324, in determine_available_memory (EngineCore_DP0 pid=64) self.model_runner.profile_run() (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4743, in profile_run (EngineCore_DP0 pid=64) hidden_states, last_hidden_states = self._dummy_run( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=64) return func(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4461, in _dummy_run (EngineCore_DP0 pid=64) outputs = self.model( (EngineCore_DP0 pid=64) ^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/cuda_graph.py", line 222, in __call__ (EngineCore_DP0 pid=64) return self.runnable(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl (EngineCore_DP0 pid=64) return self._call_impl(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl (EngineCore_DP0 pid=64) return forward_call(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/opt/VibeVoice/vllm_plugin/model.py", line 1241, in forward (EngineCore_DP0 pid=64) hidden_states = language_model.model( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 558, in __call__ (EngineCore_DP0 pid=64) output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) # type: ignore[arg-type] (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/wrapper.py", line 228, in __call__ (EngineCore_DP0 pid=64) return self._call_with_optional_nvtx_range( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/wrapper.py", line 119, in _call_with_optional_nvtx_range (EngineCore_DP0 pid=64) return callable_fn(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 845, in compile_wrapper (EngineCore_DP0 pid=64) raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 990, in _compile_fx_inner (EngineCore_DP0 pid=64) raise InductorError(e, currentframe()).with_traceback( (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 974, in _compile_fx_inner (EngineCore_DP0 pid=64) mb_compiled_graph = fx_codegen_and_compile( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 1695, in fx_codegen_and_compile (EngineCore_DP0 pid=64) return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py", line 1505, in codegen_and_compile (EngineCore_DP0 pid=64) compiled_module = graph.compile_to_module() (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2319, in compile_to_module (EngineCore_DP0 pid=64) return self._compile_to_module() (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2325, in _compile_to_module (EngineCore_DP0 pid=64) self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/graph.py", line 2271, in codegen (EngineCore_DP0 pid=64) result = self.wrapper_code.generate(self.is_inference) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1552, in generate (EngineCore_DP0 pid=64) return self._generate(is_inference) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1615, in _generate (EngineCore_DP0 pid=64) self.generate_and_run_autotune_block() (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/wrapper.py", line 1695, in generate_and_run_autotune_block (EngineCore_DP0 pid=64) raise RuntimeError(f"Failed to run autotuning code block: {e}") from e (EngineCore_DP0 pid=64) torch._inductor.exc.InductorError: RuntimeError: Failed to run autotuning code block: No valid triton configs. PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=64) `ptxas` stderr: (EngineCore_DP0 pid=64) ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=64) (EngineCore_DP0 pid=64) Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmphjhx5dp3.ptx -o /tmp/tmphjhx5dp3.ptx.o (EngineCore_DP0 pid=64) [rank0]:[W430 07:58:49.263934227 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) (APIServer pid=1) Traceback (most recent call last): (APIServer pid=1) File "/usr/local/bin/vllm", line 10, in (APIServer pid=1) sys.exit(main()) (APIServer pid=1) ^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main (APIServer pid=1) args.dispatch_function(args) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd (APIServer pid=1) uvloop.run(run_server(args)) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run (APIServer pid=1) return __asyncio.run( (APIServer pid=1) ^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run (APIServer pid=1) return runner.run(main) (APIServer pid=1) ^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run (APIServer pid=1) return self._loop.run_until_complete(task) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper (APIServer pid=1) return await main (APIServer pid=1) ^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1319, in run_server (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1338, in run_server_worker (APIServer pid=1) async with build_async_engine_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 173, in build_async_engine_client (APIServer pid=1) async with build_async_engine_client_from_engine_args( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 214, in build_async_engine_client_from_engine_args (APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 205, in from_vllm_config (APIServer pid=1) return cls( (APIServer pid=1) ^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 132, in __init__ (APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 122, in make_async_mp_client (APIServer pid=1) return AsyncMPClient(*client_args) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 824, in __init__ (APIServer pid=1) super().__init__( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 479, in __init__ (APIServer pid=1) with launch_core_engines(vllm_config, executor_class, log_stats) as ( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ (APIServer pid=1) next(self.gen) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 921, in launch_core_engines (APIServer pid=1) wait_for_engine_startup( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 980, in wait_for_engine_startup (APIServer pid=1) raise RuntimeError( (APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} 8e58274aaa07ed2434e325f125c65e7d968c518312365fde6fe8cb62315781b1 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 900s). 52c826999ac5023e10fa711bde2ac8008d391fd1d3457420d8e393127605b408 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: Error response from daemon: No such container: vibevoice-asr Container exited before readiness. Recent logs: Error response from daemon: No such container: vibevoice-asr e8a98faa33823b0f882fb73c7b37f52cae15a26763bfb67df27efb3285451825 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: Error response from daemon: No such container: vibevoice-asr 38c12c74a301caf5576c3d0b11d239a568bc55b4f4466b6dca27e7650b6e3de8 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] Traceback (most recent call last): (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 692, in __init__ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] super().__init__( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 113, in __init__ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 270, in _initialize_kv_caches (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] self.model_executor.initialize_from_config(kv_cache_configs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] self.collective_rpc("compile_or_warm_up_model") (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] result = run_method(self.driver_worker, method, args, kwargs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] return func(*args, **kwargs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 451, in compile_or_warm_up_model (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] kernel_warmup(self) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/warmup/kernel_warmup.py", line 67, in kernel_warmup (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] worker.model_runner._dummy_run( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] return func(*args, **kwargs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4385, in _dummy_run (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] attn_metadata, _ = self._build_attention_metadata( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1780, in _build_attention_metadata (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1731, in _build_attn_group_metadata (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] attn_metadata_i = builder.build( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 918, in build (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] paged_kv_indices = self._compute_flashinfer_kv_metadata( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 760, in _compute_flashinfer_kv_metadata (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] _copy_page_indices_kernel[(num_reqs,)]( (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 419, in (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 733, in run (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 861, in _do_compile (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] kernel = self.compile(src, target=target, options=options.__dict__) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 320, in compile (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] next_module = compile_ir(module, metadata) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 520, in (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 503, in make_cubin (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] raise PTXASError(error) (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] triton.runtime.errors.PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] `ptxas` stderr: (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmpo_aaz38d.ptx -o /tmp/tmpo_aaz38d.ptx.o (EngineCore_DP0 pid=52) ERROR 05-04 03:12:21 [core.py:936] (EngineCore_DP0 pid=52) Process EngineCore_DP0: (EngineCore_DP0 pid=52) Traceback (most recent call last): (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 468, in make_cubin (EngineCore_DP0 pid=52) subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog) (EngineCore_DP0 pid=52) File "/usr/lib/python3.12/subprocess.py", line 571, in run (EngineCore_DP0 pid=52) raise CalledProcessError(retcode, process.args, (EngineCore_DP0 pid=52) subprocess.CalledProcessError: Command '['/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas', '-lineinfo', '-v', '--gpu-name=sm_121a', '/tmp/tmpo_aaz38d.ptx', '-o', '/tmp/tmpo_aaz38d.ptx.o']' returned non-zero exit status 255. (EngineCore_DP0 pid=52) (EngineCore_DP0 pid=52) During handling of the above exception, another exception occurred: (EngineCore_DP0 pid=52) (EngineCore_DP0 pid=52) Traceback (most recent call last): (EngineCore_DP0 pid=52) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap (EngineCore_DP0 pid=52) self.run() (EngineCore_DP0 pid=52) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run (EngineCore_DP0 pid=52) self._target(*self._args, **self._kwargs) (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 940, in run_engine_core (EngineCore_DP0 pid=52) raise e (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core (EngineCore_DP0 pid=52) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 692, in __init__ (EngineCore_DP0 pid=52) super().__init__( (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 113, in __init__ (EngineCore_DP0 pid=52) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 270, in _initialize_kv_caches (EngineCore_DP0 pid=52) self.model_executor.initialize_from_config(kv_cache_configs) (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config (EngineCore_DP0 pid=52) self.collective_rpc("compile_or_warm_up_model") (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc (EngineCore_DP0 pid=52) result = run_method(self.driver_worker, method, args, kwargs) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method (EngineCore_DP0 pid=52) return func(*args, **kwargs) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 451, in compile_or_warm_up_model (EngineCore_DP0 pid=52) kernel_warmup(self) (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/warmup/kernel_warmup.py", line 67, in kernel_warmup (EngineCore_DP0 pid=52) worker.model_runner._dummy_run( (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=52) return func(*args, **kwargs) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4385, in _dummy_run (EngineCore_DP0 pid=52) attn_metadata, _ = self._build_attention_metadata( (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1780, in _build_attention_metadata (EngineCore_DP0 pid=52) _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1731, in _build_attn_group_metadata (EngineCore_DP0 pid=52) attn_metadata_i = builder.build( (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 918, in build (EngineCore_DP0 pid=52) paged_kv_indices = self._compute_flashinfer_kv_metadata( (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 760, in _compute_flashinfer_kv_metadata (EngineCore_DP0 pid=52) _copy_page_indices_kernel[(num_reqs,)]( (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 419, in (EngineCore_DP0 pid=52) return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 733, in run (EngineCore_DP0 pid=52) kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 861, in _do_compile (EngineCore_DP0 pid=52) kernel = self.compile(src, target=target, options=options.__dict__) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 320, in compile (EngineCore_DP0 pid=52) next_module = compile_ir(module, metadata) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 520, in (EngineCore_DP0 pid=52) stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch) (EngineCore_DP0 pid=52) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=52) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 503, in make_cubin (EngineCore_DP0 pid=52) raise PTXASError(error) (EngineCore_DP0 pid=52) triton.runtime.errors.PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=52) `ptxas` stderr: (EngineCore_DP0 pid=52) ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=52) (EngineCore_DP0 pid=52) Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmpo_aaz38d.ptx -o /tmp/tmpo_aaz38d.ptx.o (EngineCore_DP0 pid=52) [rank0]:[W504 03:12:22.161718520 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) (APIServer pid=1) Traceback (most recent call last): (APIServer pid=1) File "/usr/local/bin/vllm", line 10, in (APIServer pid=1) sys.exit(main()) (APIServer pid=1) ^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main (APIServer pid=1) args.dispatch_function(args) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd (APIServer pid=1) uvloop.run(run_server(args)) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run (APIServer pid=1) return __asyncio.run( (APIServer pid=1) ^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run (APIServer pid=1) return runner.run(main) (APIServer pid=1) ^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run (APIServer pid=1) return self._loop.run_until_complete(task) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper (APIServer pid=1) return await main (APIServer pid=1) ^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1319, in run_server (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1338, in run_server_worker (APIServer pid=1) async with build_async_engine_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 173, in build_async_engine_client (APIServer pid=1) async with build_async_engine_client_from_engine_args( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 214, in build_async_engine_client_from_engine_args (APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 205, in from_vllm_config (APIServer pid=1) return cls( (APIServer pid=1) ^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 132, in __init__ (APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 122, in make_async_mp_client (APIServer pid=1) return AsyncMPClient(*client_args) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 824, in __init__ (APIServer pid=1) super().__init__( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 479, in __init__ (APIServer pid=1) with launch_core_engines(vllm_config, executor_class, log_stats) as ( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ (APIServer pid=1) next(self.gen) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 921, in launch_core_engines (APIServer pid=1) wait_for_engine_startup( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 980, in wait_for_engine_startup (APIServer pid=1) raise RuntimeError( (APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} 6b4a91e67b7e7cdcf5efb95238100d8f7d83d66133d26182b5b06ed735d42691 Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Container exited before readiness. Recent logs: (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] Traceback (most recent call last): (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 692, in __init__ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] super().__init__( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 113, in __init__ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 270, in _initialize_kv_caches (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] self.model_executor.initialize_from_config(kv_cache_configs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] self.collective_rpc("compile_or_warm_up_model") (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] result = run_method(self.driver_worker, method, args, kwargs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] return func(*args, **kwargs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 451, in compile_or_warm_up_model (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] kernel_warmup(self) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/warmup/kernel_warmup.py", line 67, in kernel_warmup (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] worker.model_runner._dummy_run( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] return func(*args, **kwargs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4385, in _dummy_run (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] attn_metadata, _ = self._build_attention_metadata( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1780, in _build_attention_metadata (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1731, in _build_attn_group_metadata (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] attn_metadata_i = builder.build( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 918, in build (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] paged_kv_indices = self._compute_flashinfer_kv_metadata( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 760, in _compute_flashinfer_kv_metadata (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] _copy_page_indices_kernel[(num_reqs,)]( (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 419, in (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 733, in run (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 861, in _do_compile (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] kernel = self.compile(src, target=target, options=options.__dict__) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 320, in compile (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] next_module = compile_ir(module, metadata) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 520, in (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 503, in make_cubin (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] raise PTXASError(error) (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] triton.runtime.errors.PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] `ptxas` stderr: (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmps2bfse10.ptx -o /tmp/tmps2bfse10.ptx.o (EngineCore_DP0 pid=64) ERROR 05-04 03:17:27 [core.py:936] (EngineCore_DP0 pid=64) Process EngineCore_DP0: (EngineCore_DP0 pid=64) Traceback (most recent call last): (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 468, in make_cubin (EngineCore_DP0 pid=64) subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog) (EngineCore_DP0 pid=64) File "/usr/lib/python3.12/subprocess.py", line 571, in run (EngineCore_DP0 pid=64) raise CalledProcessError(retcode, process.args, (EngineCore_DP0 pid=64) subprocess.CalledProcessError: Command '['/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas', '-lineinfo', '-v', '--gpu-name=sm_121a', '/tmp/tmps2bfse10.ptx', '-o', '/tmp/tmps2bfse10.ptx.o']' returned non-zero exit status 255. (EngineCore_DP0 pid=64) (EngineCore_DP0 pid=64) During handling of the above exception, another exception occurred: (EngineCore_DP0 pid=64) (EngineCore_DP0 pid=64) Traceback (most recent call last): (EngineCore_DP0 pid=64) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap (EngineCore_DP0 pid=64) self.run() (EngineCore_DP0 pid=64) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run (EngineCore_DP0 pid=64) self._target(*self._args, **self._kwargs) (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 940, in run_engine_core (EngineCore_DP0 pid=64) raise e (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 927, in run_engine_core (EngineCore_DP0 pid=64) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 692, in __init__ (EngineCore_DP0 pid=64) super().__init__( (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 113, in __init__ (EngineCore_DP0 pid=64) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 270, in _initialize_kv_caches (EngineCore_DP0 pid=64) self.model_executor.initialize_from_config(kv_cache_configs) (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 116, in initialize_from_config (EngineCore_DP0 pid=64) self.collective_rpc("compile_or_warm_up_model") (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 75, in collective_rpc (EngineCore_DP0 pid=64) result = run_method(self.driver_worker, method, args, kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method (EngineCore_DP0 pid=64) return func(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 451, in compile_or_warm_up_model (EngineCore_DP0 pid=64) kernel_warmup(self) (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/warmup/kernel_warmup.py", line 67, in kernel_warmup (EngineCore_DP0 pid=64) worker.model_runner._dummy_run( (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context (EngineCore_DP0 pid=64) return func(*args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4385, in _dummy_run (EngineCore_DP0 pid=64) attn_metadata, _ = self._build_attention_metadata( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1780, in _build_attention_metadata (EngineCore_DP0 pid=64) _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1731, in _build_attn_group_metadata (EngineCore_DP0 pid=64) attn_metadata_i = builder.build( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 918, in build (EngineCore_DP0 pid=64) paged_kv_indices = self._compute_flashinfer_kv_metadata( (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py", line 760, in _compute_flashinfer_kv_metadata (EngineCore_DP0 pid=64) _copy_page_indices_kernel[(num_reqs,)]( (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 419, in (EngineCore_DP0 pid=64) return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 733, in run (EngineCore_DP0 pid=64) kernel = self._do_compile(key, signature, device, constexprs, options, attrs, warmup) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 861, in _do_compile (EngineCore_DP0 pid=64) kernel = self.compile(src, target=target, options=options.__dict__) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 320, in compile (EngineCore_DP0 pid=64) next_module = compile_ir(module, metadata) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 520, in (EngineCore_DP0 pid=64) stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch) (EngineCore_DP0 pid=64) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (EngineCore_DP0 pid=64) File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 503, in make_cubin (EngineCore_DP0 pid=64) raise PTXASError(error) (EngineCore_DP0 pid=64) triton.runtime.errors.PTXASError: PTXAS error: Internal Triton PTX codegen error (EngineCore_DP0 pid=64) `ptxas` stderr: (EngineCore_DP0 pid=64) ptxas fatal : Value 'sm_121a' is not defined for option 'gpu-name' (EngineCore_DP0 pid=64) (EngineCore_DP0 pid=64) Repro command: /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_121a /tmp/tmps2bfse10.ptx -o /tmp/tmps2bfse10.ptx.o (EngineCore_DP0 pid=64) [rank0]:[W504 03:17:38.171227643 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) (APIServer pid=1) Traceback (most recent call last): (APIServer pid=1) File "/usr/local/bin/vllm", line 10, in (APIServer pid=1) sys.exit(main()) (APIServer pid=1) ^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main (APIServer pid=1) args.dispatch_function(args) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd (APIServer pid=1) uvloop.run(run_server(args)) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run (APIServer pid=1) return __asyncio.run( (APIServer pid=1) ^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run (APIServer pid=1) return runner.run(main) (APIServer pid=1) ^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run (APIServer pid=1) return self._loop.run_until_complete(task) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper (APIServer pid=1) return await main (APIServer pid=1) ^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1319, in run_server (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1338, in run_server_worker (APIServer pid=1) async with build_async_engine_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 173, in build_async_engine_client (APIServer pid=1) async with build_async_engine_client_from_engine_args( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ (APIServer pid=1) return await anext(self.gen) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 214, in build_async_engine_client_from_engine_args (APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 205, in from_vllm_config (APIServer pid=1) return cls( (APIServer pid=1) ^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 132, in __init__ (APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 122, in make_async_mp_client (APIServer pid=1) return AsyncMPClient(*client_args) (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 824, in __init__ (APIServer pid=1) super().__init__( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 479, in __init__ (APIServer pid=1) with launch_core_engines(vllm_config, executor_class, log_stats) as ( (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ (APIServer pid=1) next(self.gen) (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 921, in launch_core_engines (APIServer pid=1) wait_for_engine_startup( (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 980, in wait_for_engine_startup (APIServer pid=1) raise RuntimeError( (APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} 1a32f0137ccd64570d19545e990f1cacf19e2d3e06f361c185393a053fc85e1a Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Server ready: docker exec vibevoice-asr curl -s http://127.0.0.1:8000/v1/models c879783cf610d245901fb572425b74550c5530e88540922ec0799eb432e0f54e Started vibevoice-asr. Waiting for /v1/models readiness (timeout 600s). Server ready: docker exec vibevoice-asr curl -s http://127.0.0.1:8000/v1/models