.clang-format
.coveragerc
.dockerignore
.git-blame-ignore-revs
.gitignore
.markdownlint.yaml
.pre-commit-config.yaml
.readthedocs.yaml
.shellcheckrc
.yapfignore
AGENTS.md
CLAUDE.md
CMakeLists.txt
CODE_OF_CONDUCT.md
CONTRIBUTING.md
DCO
LICENSE
MANIFEST.in
README.md
RELEASE.md
SECURITY.md
codecov.yml
mkdocs.yaml
pyproject.toml
setup.py
use_existing_torch.py
.buildkite/.pipeline_gen_v2
.buildkite/check-wheel-size.py
.buildkite/ci_config.yaml
.buildkite/ci_config_intel.yaml
.buildkite/release-pipeline.yaml
.buildkite/test-amd.yaml
.buildkite/test-pipeline.yaml
.buildkite/hardware_tests/amd.yaml
.buildkite/hardware_tests/ascend_npu.yaml
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/gh200.yaml
.buildkite/hardware_tests/intel.yaml
.buildkite/image_build/image_build.sh
.buildkite/image_build/image_build.yaml
.buildkite/image_build/image_build_cpu.sh
.buildkite/image_build/image_build_cpu_arm64.sh
.buildkite/image_build/image_build_hpu.sh
.buildkite/image_build/image_build_xpu.sh
.buildkite/intel_jobs/test-intel.yaml
.buildkite/lm-eval-harness/conftest.py
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
.buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
.buildkite/lm-eval-harness/configs/models-large.txt
.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
.buildkite/lm-eval-harness/configs/models-mm-small.txt
.buildkite/lm-eval-harness/configs/models-small-rocm.txt
.buildkite/lm-eval-harness/configs/models-small.txt
.buildkite/performance-benchmarks/README.md
.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md
.buildkite/performance-benchmarks/scripts/compare-json-results.py
.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
.buildkite/performance-benchmarks/scripts/launch-server.sh
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
.buildkite/performance-benchmarks/tests/genai-perf-tests.json
.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json
.buildkite/performance-benchmarks/tests/latency-tests-cpu.json
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
.buildkite/performance-benchmarks/tests/latency-tests.json
.buildkite/performance-benchmarks/tests/nightly-tests.json
.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
.buildkite/performance-benchmarks/tests/serving-tests.json
.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json
.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
.buildkite/performance-benchmarks/tests/throughput-tests.json
.buildkite/scripts/annotate-release.sh
.buildkite/scripts/annotate-rocm-release.sh
.buildkite/scripts/cache-rocm-base-wheels.sh
.buildkite/scripts/check-ray-compatibility.sh
.buildkite/scripts/cherry-pick-from-milestone.sh
.buildkite/scripts/ci-clean-log.sh
.buildkite/scripts/cleanup-nightly-builds.sh
.buildkite/scripts/generate-and-upload-nightly-index.sh
.buildkite/scripts/generate-nightly-index.py
.buildkite/scripts/push-nightly-builds-rocm.sh
.buildkite/scripts/push-nightly-builds.sh
.buildkite/scripts/rerun-test.sh
.buildkite/scripts/run-benchmarks.sh
.buildkite/scripts/run-multi-node-test.sh
.buildkite/scripts/trigger-ci-build.sh
.buildkite/scripts/upload-nightly-wheels.sh
.buildkite/scripts/upload-release-wheels-pypi.sh
.buildkite/scripts/upload-rocm-wheels.sh
.buildkite/scripts/hardware_ci/run-amd-test.sh
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-gh200-test.sh
.buildkite/scripts/hardware_ci/run-hpu-test.sh
.buildkite/scripts/hardware_ci/run-intel-test.sh
.buildkite/scripts/hardware_ci/run-npu-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
.buildkite/scripts/tool_call/run-bfcl-eval.sh
.buildkite/scripts/tpu/cleanup_docker.sh
.buildkite/scripts/tpu/config_v6e_1.env
.buildkite/scripts/tpu/docker_run_bm.sh
.buildkite/scripts/tpu/quantized_v6e_1.env
.buildkite/scripts/tpu/run_bm.sh
.buildkite/test_areas/attention.yaml
.buildkite/test_areas/basic_correctness.yaml
.buildkite/test_areas/benchmarks.yaml
.buildkite/test_areas/compile.yaml
.buildkite/test_areas/cuda.yaml
.buildkite/test_areas/distributed.yaml
.buildkite/test_areas/e2e_integration.yaml
.buildkite/test_areas/engine.yaml
.buildkite/test_areas/entrypoints.yaml
.buildkite/test_areas/expert_parallelism.yaml
.buildkite/test_areas/kernels.yaml
.buildkite/test_areas/lm_eval.yaml
.buildkite/test_areas/lora.yaml
.buildkite/test_areas/misc.yaml
.buildkite/test_areas/model_executor.yaml
.buildkite/test_areas/model_runner_v2.yaml
.buildkite/test_areas/models_basic.yaml
.buildkite/test_areas/models_distributed.yaml
.buildkite/test_areas/models_language.yaml
.buildkite/test_areas/models_multimodal.yaml
.buildkite/test_areas/plugins.yaml
.buildkite/test_areas/pytorch.yaml
.buildkite/test_areas/quantization.yaml
.buildkite/test_areas/ray_compat.yaml
.buildkite/test_areas/samplers.yaml
.buildkite/test_areas/spec_decode.yaml
.buildkite/test_areas/weight_loading.yaml
.gemini/config.yaml
.github/CODEOWNERS
.github/FUNDING.yml
.github/PULL_REQUEST_TEMPLATE.md
.github/dependabot.yml
.github/mergify.yml
.github/scale-config.yml
.github/ISSUE_TEMPLATE/100-documentation.yml
.github/ISSUE_TEMPLATE/200-installation.yml
.github/ISSUE_TEMPLATE/300-usage.yml
.github/ISSUE_TEMPLATE/400-bug-report.yml
.github/ISSUE_TEMPLATE/450-ci-failure.yml
.github/ISSUE_TEMPLATE/500-feature-request.yml
.github/ISSUE_TEMPLATE/600-new-model.yml
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
.github/ISSUE_TEMPLATE/750-RFC.yml
.github/ISSUE_TEMPLATE/config.yml
.github/workflows/add_label_automerge.yml
.github/workflows/issue_autolabel.yml
.github/workflows/macos-smoke-test.yml
.github/workflows/new_pr_bot.yml
.github/workflows/pre-commit.yml
.github/workflows/stale.yml
.github/workflows/matchers/actionlint.json
.github/workflows/matchers/markdownlint.json
.github/workflows/matchers/mypy.json
.github/workflows/scripts/build.sh
.github/workflows/scripts/create_release.js
.github/workflows/scripts/cuda-install.sh
.github/workflows/scripts/env.sh
.github/workflows/scripts/pytorch-install.sh
benchmarks/README.md
benchmarks/backend_request_func.py
benchmarks/benchmark_batch_invariance.py
benchmarks/benchmark_block_pool.py
benchmarks/benchmark_hash.py
benchmarks/benchmark_latency.py
benchmarks/benchmark_long_document_qa_throughput.py
benchmarks/benchmark_ngram_proposer.py
benchmarks/benchmark_prefix_block_hash.py
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prioritization.py
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving_structured_output.py
benchmarks/benchmark_throughput.py
benchmarks/benchmark_topk_topp.py
benchmarks/benchmark_utils.py
benchmarks/run_structured_output_benchmark.sh
benchmarks/sonnet.txt
benchmarks/attention_benchmarks/README.md
benchmarks/attention_benchmarks/__init__.py
benchmarks/attention_benchmarks/batch_spec.py
benchmarks/attention_benchmarks/benchmark.py
benchmarks/attention_benchmarks/common.py
benchmarks/attention_benchmarks/mla_runner.py
benchmarks/attention_benchmarks/runner.py
benchmarks/attention_benchmarks/configs/mla_decode.yaml
benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
benchmarks/attention_benchmarks/configs/mla_prefill.yaml
benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml
benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
benchmarks/attention_benchmarks/configs/speculative_decode.yaml
benchmarks/attention_benchmarks/configs/standard_attention.yaml
benchmarks/auto_tune/README.md
benchmarks/auto_tune/auto_tune.sh
benchmarks/auto_tune/batch_auto_tune.sh
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
benchmarks/disagg_benchmarks/round_robin_proxy.py
benchmarks/disagg_benchmarks/visualize_benchmark_results.py
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
benchmarks/kernels/bench_concat_mla_q.py
benchmarks/kernels/bench_cp_gather_fp8.py
benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
benchmarks/kernels/benchmark_activation.py
benchmarks/kernels/benchmark_block_fp8_gemm.py
benchmarks/kernels/benchmark_cutlass_moe_fp8.py
benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
benchmarks/kernels/benchmark_device_communicators.py
benchmarks/kernels/benchmark_fp8_gemm.py
benchmarks/kernels/benchmark_fused_collective.py
benchmarks/kernels/benchmark_fused_topk.py
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
benchmarks/kernels/benchmark_int8_gemm.py
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_lora.py
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_marlin.py
benchmarks/kernels/benchmark_mla_k_concat.py
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe_align_block_size.py
benchmarks/kernels/benchmark_moe_defaults.py
benchmarks/kernels/benchmark_moe_permute_unpermute.py
benchmarks/kernels/benchmark_mrope.py
benchmarks/kernels/benchmark_mxfp4_qutlass.py
benchmarks/kernels/benchmark_nvfp4_gemm.py
benchmarks/kernels/benchmark_nvfp4_quant.py
benchmarks/kernels/benchmark_nvfp4_qutlass.py
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_per_token_group_quant.py
benchmarks/kernels/benchmark_per_token_quant_fp8.py
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_reshape_and_cache.py
benchmarks/kernels/benchmark_reshape_and_cache_flash.py
benchmarks/kernels/benchmark_rmsnorm.py
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_router_gemm.py
benchmarks/kernels/benchmark_shapes.py
benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
benchmarks/kernels/benchmark_trtllm_decode_attention.py
benchmarks/kernels/benchmark_trtllm_prefill_attention.py
benchmarks/kernels/benchmark_w8a8_block_fp8.py
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/requirements.txt
benchmarks/kernels/utils.py
benchmarks/kernels/weight_shapes.py
benchmarks/kernels/cpu/benchmark_cpu_attn.py
benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
benchmarks/kernels/deepgemm/README.md
benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
benchmarks/multi_turn/README.md
benchmarks/multi_turn/bench_dataset.py
benchmarks/multi_turn/bench_utils.py
benchmarks/multi_turn/benchmark_serving_multi_turn.py
benchmarks/multi_turn/convert_sharegpt_to_openai.py
benchmarks/multi_turn/generate_multi_turn.json
benchmarks/multi_turn/requirements.txt
benchmarks/overheads/benchmark_hashing.py
benchmarks/structured_schemas/structured_schema_1.json
cmake/cpu_extension.cmake
cmake/hipify.py
cmake/utils.cmake
cmake/external_projects/flashmla.cmake
cmake/external_projects/qutlass.cmake
cmake/external_projects/triton_kernels.cmake
cmake/external_projects/vllm_flash_attn.cmake
csrc/activation_kernels.cu
csrc/cache.h
csrc/cache_kernels.cu
csrc/cache_kernels_fused.cu
csrc/concat_mla_q.cuh
csrc/cub_helpers.h
csrc/cuda_compat.h
csrc/cuda_utils.h
csrc/cuda_utils_kernels.cu
csrc/cuda_vec_utils.cuh
csrc/cuda_view.cu
csrc/cumem_allocator.cpp
csrc/cumem_allocator_compat.h
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce_test.cu
csrc/custom_quickreduce.cu
csrc/dispatch_utils.h
csrc/dsv3_fused_a_gemm.cu
csrc/fused_qknorm_rope_kernel.cu
csrc/launch_bounds_utils.h
csrc/layernorm_kernels.cu
csrc/layernorm_quant_kernels.cu
csrc/minimax_reduce_rms_kernel.cu
csrc/minimax_reduce_rms_kernel.h
csrc/ops.h
csrc/pos_encoding_kernels.cu
csrc/sampler.cu
csrc/topk.cu
csrc/torch_bindings.cpp
csrc/type_convert.cuh
csrc/attention/attention_dtypes.h
csrc/attention/attention_generic.cuh
csrc/attention/attention_kernels.cuh
csrc/attention/attention_utils.cuh
csrc/attention/dtype_bfloat16.cuh
csrc/attention/dtype_float16.cuh
csrc/attention/dtype_float32.cuh
csrc/attention/dtype_fp8.cuh
csrc/attention/merge_attn_states.cu
csrc/attention/paged_attention_v1.cu
csrc/attention/paged_attention_v2.cu
csrc/attention/vertical_slash_index.cu
csrc/attention/mla/sm100_cutlass_mla_kernel.cu
csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
csrc/core/batch_invariant.hpp
csrc/core/exception.hpp
csrc/core/math.hpp
csrc/core/registration.h
csrc/core/scalar_type.hpp
csrc/cpu/activation.cpp
csrc/cpu/cpu_arch_macros.h
csrc/cpu/cpu_attn.cpp
csrc/cpu/cpu_attn_amx.hpp
csrc/cpu/cpu_attn_impl.hpp
csrc/cpu/cpu_attn_neon.hpp
csrc/cpu/cpu_attn_neon_bfmmla.hpp
csrc/cpu/cpu_attn_vec.hpp
csrc/cpu/cpu_attn_vec16.hpp
csrc/cpu/cpu_attn_vxe.hpp
csrc/cpu/cpu_fused_moe.cpp
csrc/cpu/cpu_types.hpp
csrc/cpu/cpu_types_arm.hpp
csrc/cpu/cpu_types_riscv.hpp
csrc/cpu/cpu_types_scalar.hpp
csrc/cpu/cpu_types_vsx.hpp
csrc/cpu/cpu_types_vxe.hpp
csrc/cpu/cpu_types_x86.hpp
csrc/cpu/cpu_wna16.cpp
csrc/cpu/dnnl_helper.cpp
csrc/cpu/dnnl_helper.h
csrc/cpu/dnnl_kernels.cpp
csrc/cpu/float_convert.hpp
csrc/cpu/generate_cpu_attn_dispatch.py
csrc/cpu/layernorm.cpp
csrc/cpu/mla_decode.cpp
csrc/cpu/pos_encoding.cpp
csrc/cpu/shm.cpp
csrc/cpu/torch_bindings.cpp
csrc/cpu/utils.cpp
csrc/cpu/utils.hpp
csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
csrc/cpu/sgl-kernels/common.h
csrc/cpu/sgl-kernels/gemm.cpp
csrc/cpu/sgl-kernels/gemm.h
csrc/cpu/sgl-kernels/gemm_fp8.cpp
csrc/cpu/sgl-kernels/gemm_int8.cpp
csrc/cpu/sgl-kernels/moe.cpp
csrc/cpu/sgl-kernels/moe_fp8.cpp
csrc/cpu/sgl-kernels/moe_int8.cpp
csrc/cpu/sgl-kernels/vec.h
csrc/cutlass_extensions/common.cpp
csrc/cutlass_extensions/common.hpp
csrc/cutlass_extensions/cute_utils.cuh
csrc/cutlass_extensions/torch_utils.hpp
csrc/cutlass_extensions/vllm_collective_builder.cuh
csrc/cutlass_extensions/vllm_custom_types.cuh
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_numeric_conversion.cuh
csrc/cutlass_extensions/vllm_type_utils.cuh
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp
csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp
csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp
csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp
csrc/libtorch_stable/dispatch_utils.h
csrc/libtorch_stable/ops.h
csrc/libtorch_stable/permute_cols.cu
csrc/libtorch_stable/torch_bindings.cpp
csrc/libtorch_stable/torch_utils.h
csrc/libtorch_stable/quantization/vectorization.cuh
csrc/libtorch_stable/quantization/vectorization_utils.cuh
csrc/libtorch_stable/quantization/w8a8/per_token_group_quant_8bit.h
csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu
csrc/libtorch_stable/quantization/w8a8/int8/per_token_group_quant.cu
csrc/mamba/mamba_ssm/selective_scan.h
csrc/mamba/mamba_ssm/selective_scan_fwd.cu
csrc/mamba/mamba_ssm/static_switch.h
csrc/moe/dsv3_router_gemm_bf16_out.cu
csrc/moe/dsv3_router_gemm_entry.cu
csrc/moe/dsv3_router_gemm_float_out.cu
csrc/moe/dsv3_router_gemm_utils.h
csrc/moe/dynamic_4bit_int_moe_cpu.cpp
csrc/moe/gpt_oss_router_gemm.cu
csrc/moe/gpt_oss_router_gemm.cuh
csrc/moe/grouped_topk_kernels.cu
csrc/moe/moeTopKFuncs.cuh
csrc/moe/moe_align_sum_kernels.cu
csrc/moe/moe_ops.h
csrc/moe/moe_permute_unpermute_op.cu
csrc/moe/moe_wna16.cu
csrc/moe/moe_wna16_utils.h
csrc/moe/router_gemm.cu
csrc/moe/topk_softmax_kernels.cu
csrc/moe/torch_bindings.cpp
csrc/moe/marlin_moe_wna16/.gitignore
csrc/moe/marlin_moe_wna16/generate_kernels.py
csrc/moe/marlin_moe_wna16/kernel.h
csrc/moe/marlin_moe_wna16/marlin_template.h
csrc/moe/marlin_moe_wna16/ops.cu
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
csrc/moe/permute_unpermute_kernels/dispatch.h
csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
csrc/quantization/activation_kernels.cu
csrc/quantization/utils.cuh
csrc/quantization/awq/dequantize.cuh
csrc/quantization/awq/gemm_kernels.cu
csrc/quantization/cutlass_w4a8/get_group_starts.cuh
csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
csrc/quantization/cutlass_w4a8/w4a8_utils.cu
csrc/quantization/cutlass_w4a8/w4a8_utils.cuh
csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
csrc/quantization/fp4/nvfp4_experts_quant.cu
csrc/quantization/fp4/nvfp4_quant_entry.cu
csrc/quantization/fp4/nvfp4_quant_kernels.cu
csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu
csrc/quantization/fp4/nvfp4_utils.cuh
csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
csrc/quantization/fused_kernels/layernorm_utils.cuh
csrc/quantization/fused_kernels/quant_conversions.cuh
csrc/quantization/gguf/dequantize.cuh
csrc/quantization/gguf/ggml-common.h
csrc/quantization/gguf/gguf_kernel.cu
csrc/quantization/gguf/mmq.cuh
csrc/quantization/gguf/mmvq.cuh
csrc/quantization/gguf/moe.cuh
csrc/quantization/gguf/moe_vec.cuh
csrc/quantization/gguf/vecdotq.cuh
csrc/quantization/gptq/compat.cuh
csrc/quantization/gptq/matrix_view.cuh
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/qdq_2.cuh
csrc/quantization/gptq/qdq_3.cuh
csrc/quantization/gptq/qdq_4.cuh
csrc/quantization/gptq/qdq_8.cuh
csrc/quantization/gptq/qdq_util.cuh
csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
csrc/quantization/gptq_allspark/allspark_repack.cu
csrc/quantization/gptq_allspark/allspark_utils.cuh
csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
csrc/quantization/machete/Readme.md
csrc/quantization/machete/generate.py
csrc/quantization/machete/machete_collective_builder.cuh
csrc/quantization/machete/machete_interleaving_utils.cuh
csrc/quantization/machete/machete_mainloop.cuh
csrc/quantization/machete/machete_mm_kernel.cuh
csrc/quantization/machete/machete_mm_launcher.cuh
csrc/quantization/machete/machete_prepack_kernel.cuh
csrc/quantization/machete/machete_prepack_launcher.cuh
csrc/quantization/machete/machete_prepacked_layout.cuh
csrc/quantization/machete/machete_pytorch.cu
csrc/quantization/marlin/.gitignore
csrc/quantization/marlin/awq_marlin_repack.cu
csrc/quantization/marlin/dequant.h
csrc/quantization/marlin/generate_kernels.py
csrc/quantization/marlin/gptq_marlin_repack.cu
csrc/quantization/marlin/kernel.h
csrc/quantization/marlin/marlin.cu
csrc/quantization/marlin/marlin.cuh
csrc/quantization/marlin/marlin_dtypes.cuh
csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu
csrc/quantization/marlin/marlin_mma.h
csrc/quantization/marlin/marlin_template.h
csrc/quantization/w8a8/cutlass/Epilogues.md
csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu
csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm75_dispatch.cuh
csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm80_dispatch.cuh
csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu
csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu
csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu
csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8_dispatch.cuh
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu
csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8_dispatch.cuh
csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
csrc/quantization/w8a8/cutlass/moe/moe_data.cu
csrc/quantization/w8a8/fp8/common.cu
csrc/quantization/w8a8/fp8/common.cuh
csrc/quantization/w8a8/fp8/amd/quant_utils.cuh
csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
csrc/quantization/w8a8/int8/scaled_quant.cu
csrc/quickreduce/base.h
csrc/quickreduce/quick_reduce.h
csrc/quickreduce/quick_reduce_impl.cuh
csrc/rocm/attention.cu
csrc/rocm/ops.h
csrc/rocm/skinny_gemms.cu
csrc/rocm/torch_bindings.cpp
docker/Dockerfile
docker/Dockerfile.cpu
docker/Dockerfile.nightly_torch
docker/Dockerfile.ppc64le
docker/Dockerfile.rocm
docker/Dockerfile.rocm_base
docker/Dockerfile.s390x
docker/Dockerfile.tpu
docker/Dockerfile.xpu
docker/docker-bake.hcl
docker/versions.json
docs/.nav.yml
docs/README.md
docs/maybe_skip_pr_build.sh
docs/api/README.md
docs/api/vllm/.meta.yml
docs/assets/contributing/dockerfile-stages-dependency.png
docs/assets/contributing/load-pattern-examples.png
docs/assets/deployment/anything-llm-chat-with-doc.png
docs/assets/deployment/anything-llm-chat-without-doc.png
docs/assets/deployment/anything-llm-provider.png
docs/assets/deployment/anything-llm-upload-doc.png
docs/assets/deployment/architecture_helm_deployment.png
docs/assets/deployment/chatbox-chat.png
docs/assets/deployment/chatbox-settings.png
docs/assets/deployment/claude-code-example.png
docs/assets/deployment/dify-chat.png
docs/assets/deployment/dify-create-chatbot.png
docs/assets/deployment/dify-settings.png
docs/assets/deployment/dp_external_lb.png
docs/assets/deployment/dp_internal_lb.png
docs/assets/deployment/hf-inference-endpoints-catalog.png
docs/assets/deployment/hf-inference-endpoints-choose-infra.png
docs/assets/deployment/hf-inference-endpoints-click-deploy-button.png
docs/assets/deployment/hf-inference-endpoints-configure-container.png
docs/assets/deployment/hf-inference-endpoints-create-endpoint.png
docs/assets/deployment/hf-inference-endpoints-locate-deploy-button.png
docs/assets/deployment/hf-inference-endpoints-new-endpoint.png
docs/assets/deployment/hf-inference-endpoints-select-hardware.png
docs/assets/deployment/hf-inference-endpoints-select-model.png
docs/assets/deployment/open_webui.png
docs/assets/deployment/streamlit-chat.png
docs/assets/design/hierarchy.png
docs/assets/design/arch_overview/entrypoints.excalidraw.png
docs/assets/design/arch_overview/llm_engine.excalidraw.png
docs/assets/design/arch_overview/v1_process_architecture_tp2_dp4.png
docs/assets/design/arch_overview/v1_process_architecture_tp4.png
docs/assets/design/cuda_graphs/current_design.png
docs/assets/design/cuda_graphs/executor_runtime.png
docs/assets/design/cuda_graphs/previous_design.png
docs/assets/design/cuda_graphs/wrapper_flow.png
docs/assets/design/debug_vllm_compile/design_diagram.png
docs/assets/design/debug_vllm_compile/dynamic_shapes.png
docs/assets/design/debug_vllm_compile/tlparse_inductor.png
docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png
docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png
docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png
docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png
docs/assets/design/hybrid_kv_cache_manager/basic_grouping_example.png
docs/assets/design/hybrid_kv_cache_manager/full_attn.png
docs/assets/design/hybrid_kv_cache_manager/memory_layout.png
docs/assets/design/hybrid_kv_cache_manager/overview.png
docs/assets/design/hybrid_kv_cache_manager/sw_attn.png
docs/assets/design/metrics/intervals-1.png
docs/assets/design/metrics/intervals-2.png
docs/assets/design/metrics/intervals-3.png
docs/assets/design/model_runner_v2/async_no_race_condition.png
docs/assets/design/model_runner_v2/async_race_condition.png
docs/assets/design/model_runner_v2/async_sched.png
docs/assets/design/model_runner_v2/persistent_batch_mrv2.png
docs/assets/design/model_runner_v2/persistent_batch_v1.png
docs/assets/design/paged_attention/k_vecs.png
docs/assets/design/paged_attention/key.png
docs/assets/design/paged_attention/logits_vec.png
docs/assets/design/paged_attention/q_vecs.png
docs/assets/design/paged_attention/query.png
docs/assets/design/paged_attention/v_vec.png
docs/assets/design/paged_attention/value.png
docs/assets/design/prefix_caching/example-time-1.png
docs/assets/design/prefix_caching/example-time-3.png
docs/assets/design/prefix_caching/example-time-4.png
docs/assets/design/prefix_caching/example-time-5.png
docs/assets/design/prefix_caching/example-time-6.png
docs/assets/design/prefix_caching/example-time-7.png
docs/assets/design/prefix_caching/free.png
docs/assets/design/prefix_caching/overview.png
docs/assets/design/tpu/most_model_len.png
docs/assets/features/disagg_encoder/disagg_encoder_flow.png
docs/assets/features/disagg_prefill/abstraction.jpg
docs/assets/features/disagg_prefill/high_level_design.png
docs/assets/features/disagg_prefill/overview.jpg
docs/assets/features/disagg_prefill/workflow.png
docs/assets/features/speculative_decoding/speculators-user-flow-dark.svg
docs/assets/features/speculative_decoding/speculators-user-flow-light.svg
docs/assets/logos/vllm-logo-only-light.ico
docs/assets/logos/vllm-logo-only-light.png
docs/assets/logos/vllm-logo-text-dark.png
docs/assets/logos/vllm-logo-text-light.png
docs/benchmarking/README.md
docs/benchmarking/cli.md
docs/benchmarking/dashboard.md
docs/benchmarking/sweeps.md
docs/cli/.meta.yml
docs/cli/.nav.yml
docs/cli/README.md
docs/cli/chat.md
docs/cli/complete.md
docs/cli/json_tip.inc.md
docs/cli/run-batch.md
docs/cli/serve.md
docs/cli/bench/latency.md
docs/cli/bench/mm_processor.md
docs/cli/bench/serve.md
docs/cli/bench/throughput.md
docs/cli/bench/sweep/plot.md
docs/cli/bench/sweep/plot_pareto.md
docs/cli/bench/sweep/serve.md
docs/cli/bench/sweep/serve_workload.md
docs/community/contact_us.md
docs/community/meetups.md
docs/community/sponsors.md
docs/configuration/README.md
docs/configuration/conserving_memory.md
docs/configuration/engine_args.md
docs/configuration/env_vars.md
docs/configuration/model_resolution.md
docs/configuration/optimization.md
docs/configuration/serve_args.md
docs/contributing/README.md
docs/contributing/deprecation_policy.md
docs/contributing/editing-agent-instructions.md
docs/contributing/incremental_build.md
docs/contributing/profiling.md
docs/contributing/vulnerability_management.md
docs/contributing/ci/failures.md
docs/contributing/ci/nightly_builds.md
docs/contributing/ci/update_pytorch_version.md
docs/contributing/dockerfile/dockerfile.md
docs/contributing/model/README.md
docs/contributing/model/basic.md
docs/contributing/model/multimodal.md
docs/contributing/model/registration.md
docs/contributing/model/tests.md
docs/contributing/model/transcription.md
docs/deployment/docker.md
docs/deployment/k8s.md
docs/deployment/nginx.md
docs/deployment/frameworks/anyscale.md
docs/deployment/frameworks/anything-llm.md
docs/deployment/frameworks/autogen.md
docs/deployment/frameworks/bentoml.md
docs/deployment/frameworks/cerebrium.md
docs/deployment/frameworks/chatbox.md
docs/deployment/frameworks/dify.md
docs/deployment/frameworks/dstack.md
docs/deployment/frameworks/haystack.md
docs/deployment/frameworks/helm.md
docs/deployment/frameworks/hf_inference_endpoints.md
docs/deployment/frameworks/litellm.md
docs/deployment/frameworks/lobe-chat.md
docs/deployment/frameworks/lws.md
docs/deployment/frameworks/modal.md
docs/deployment/frameworks/open-webui.md
docs/deployment/frameworks/retrieval_augmented_generation.md
docs/deployment/frameworks/runpod.md
docs/deployment/frameworks/skypilot.md
docs/deployment/frameworks/streamlit.md
docs/deployment/frameworks/triton.md
docs/deployment/integrations/aibrix.md
docs/deployment/integrations/dynamo.md
docs/deployment/integrations/kaito.md
docs/deployment/integrations/kserve.md
docs/deployment/integrations/kthena.md
docs/deployment/integrations/kubeai.md
docs/deployment/integrations/kuberay.md
docs/deployment/integrations/llamastack.md
docs/deployment/integrations/llm-d.md
docs/deployment/integrations/llmaz.md
docs/deployment/integrations/production-stack.md
docs/design/arch_overview.md
docs/design/attention_backends.md
docs/design/cuda_graphs.md
docs/design/cuda_graphs_multimodal.md
docs/design/custom_op.md
docs/design/dbo.md
docs/design/debug_vllm_compile.md
docs/design/fused_moe_modular_kernel.md
docs/design/fusions.md
docs/design/huggingface_integration.md
docs/design/hybrid_kv_cache_manager.md
docs/design/io_processor_plugins.md
docs/design/logits_processors.md
docs/design/lora_resolver_plugins.md
docs/design/metrics.md
docs/design/mm_processing.md
docs/design/model_runner_v2.md
docs/design/moe_kernel_features.md
docs/design/multiprocessing.md
docs/design/optimization_levels.md
docs/design/p2p_nccl_connector.md
docs/design/paged_attention.md
docs/design/plugin_system.md
docs/design/prefix_caching.md
docs/design/torch_compile.md
docs/design/torch_compile_multimodal.md
docs/examples/README.md
docs/features/README.md
docs/features/automatic_prefix_caching.md
docs/features/batch_invariance.md
docs/features/custom_arguments.md
docs/features/custom_logitsprocs.md
docs/features/disagg_encoder.md
docs/features/disagg_prefill.md
docs/features/interleaved_thinking.md
docs/features/lora.md
docs/features/mooncake_connector_usage.md
docs/features/multimodal_inputs.md
docs/features/nixl_connector_usage.md
docs/features/prompt_embeds.md
docs/features/reasoning_outputs.md
docs/features/sleep_mode.md
docs/features/structured_outputs.md
docs/features/tool_calling.md
docs/features/quantization/README.md
docs/features/quantization/auto_awq.md
docs/features/quantization/bnb.md
docs/features/quantization/fp8.md
docs/features/quantization/gguf.md
docs/features/quantization/gptqmodel.md
docs/features/quantization/inc.md
docs/features/quantization/int4.md
docs/features/quantization/int8.md
docs/features/quantization/llm_compressor.md
docs/features/quantization/modelopt.md
docs/features/quantization/quantized_kvcache.md
docs/features/quantization/quark.md
docs/features/quantization/torchao.md
docs/features/speculative_decoding/README.md
docs/features/speculative_decoding/draft_model.md
docs/features/speculative_decoding/eagle.md
docs/features/speculative_decoding/mlp.md
docs/features/speculative_decoding/mtp.md
docs/features/speculative_decoding/n_gram.md
docs/features/speculative_decoding/parallel_draft_model.md
docs/features/speculative_decoding/speculators.md
docs/features/speculative_decoding/suffix.md
docs/getting_started/quickstart.md
docs/getting_started/installation/.nav.yml
docs/getting_started/installation/README.md
docs/getting_started/installation/cpu.apple.inc.md
docs/getting_started/installation/cpu.arm.inc.md
docs/getting_started/installation/cpu.md
docs/getting_started/installation/cpu.s390x.inc.md
docs/getting_started/installation/cpu.x86.inc.md
docs/getting_started/installation/device.template.md
docs/getting_started/installation/gpu.cuda.inc.md
docs/getting_started/installation/gpu.md
docs/getting_started/installation/gpu.rocm.inc.md
docs/getting_started/installation/gpu.xpu.inc.md
docs/getting_started/installation/python_env_setup.inc.md
docs/governance/collaboration.md
docs/governance/committers.md
docs/governance/process.md
docs/mkdocs/hooks/generate_argparse.py
docs/mkdocs/hooks/generate_examples.py
docs/mkdocs/hooks/generate_metrics.py
docs/mkdocs/hooks/remove_announcement.py
docs/mkdocs/hooks/url_schemes.py
docs/mkdocs/javascript/edit_and_feedback.js
docs/mkdocs/javascript/mathjax.js
docs/mkdocs/javascript/run_llm_widget.js
docs/mkdocs/javascript/slack_and_forum.js
docs/mkdocs/overrides/main.html
docs/mkdocs/overrides/partials/toc-item.html
docs/mkdocs/stylesheets/extra.css
docs/models/generative_models.md
docs/models/supported_models.md
docs/models/extensions/fastsafetensor.md
docs/models/extensions/instanttensor.md
docs/models/extensions/runai_model_streamer.md
docs/models/extensions/tensorizer.md
docs/models/hardware_supported_models/cpu.md
docs/models/hardware_supported_models/xpu.md
docs/models/pooling_models/README.md
docs/models/pooling_models/classify.md
docs/models/pooling_models/embed.md
docs/models/pooling_models/reward.md
docs/models/pooling_models/scoring.md
docs/models/pooling_models/specific_models.md
docs/models/pooling_models/token_classify.md
docs/models/pooling_models/token_embed.md
docs/serving/context_parallel_deployment.md
docs/serving/data_parallel_deployment.md
docs/serving/distributed_troubleshooting.md
docs/serving/expert_parallel_deployment.md
docs/serving/offline_inference.md
docs/serving/openai_compatible_server.md
docs/serving/parallelism_scaling.md
docs/serving/integrations/claude_code.md
docs/serving/integrations/langchain.md
docs/serving/integrations/llamaindex.md
docs/training/async_rl.md
docs/training/rlhf.md
docs/training/trl.md
docs/training/weight_transfer/README.md
docs/training/weight_transfer/base.md
docs/training/weight_transfer/ipc.md
docs/training/weight_transfer/nccl.md
docs/usage/README.md
docs/usage/faq.md
docs/usage/metrics.md
docs/usage/reproducibility.md
docs/usage/security.md
docs/usage/troubleshooting.md
docs/usage/usage_stats.md
docs/usage/v1_guide.md
examples/template_alpaca.jinja
examples/template_baichuan.jinja
examples/template_chatglm.jinja
examples/template_chatglm2.jinja
examples/template_chatml.jinja
examples/template_falcon.jinja
examples/template_falcon_180b.jinja
examples/template_inkbot.jinja
examples/template_teleflm.jinja
examples/tool_chat_template_deepseekr1.jinja
examples/tool_chat_template_deepseekv3.jinja
examples/tool_chat_template_deepseekv31.jinja
examples/tool_chat_template_functiongemma.jinja
examples/tool_chat_template_gemma3_pythonic.jinja
examples/tool_chat_template_gemma4.jinja
examples/tool_chat_template_glm4.jinja
examples/tool_chat_template_granite.jinja
examples/tool_chat_template_granite_20b_fc.jinja
examples/tool_chat_template_hermes.jinja
examples/tool_chat_template_hunyuan_a13b.jinja
examples/tool_chat_template_internlm2_tool.jinja
examples/tool_chat_template_llama3.1_json.jinja
examples/tool_chat_template_llama3.2_json.jinja
examples/tool_chat_template_llama3.2_pythonic.jinja
examples/tool_chat_template_llama4_json.jinja
examples/tool_chat_template_llama4_pythonic.jinja
examples/tool_chat_template_minimax_m1.jinja
examples/tool_chat_template_mistral.jinja
examples/tool_chat_template_mistral3.jinja
examples/tool_chat_template_mistral_parallel.jinja
examples/tool_chat_template_phi4_mini.jinja
examples/tool_chat_template_qwen3coder.jinja
examples/tool_chat_template_toolace.jinja
examples/tool_chat_template_xlam_llama.jinja
examples/tool_chat_template_xlam_qwen.jinja
examples/basic/offline_inference/README.md
examples/basic/offline_inference/basic.py
examples/basic/offline_inference/chat.py
examples/basic/offline_inference/classify.py
examples/basic/offline_inference/embed.py
examples/basic/offline_inference/generate.py
examples/basic/offline_inference/reward.py
examples/basic/offline_inference/score.py
examples/basic/online_serving/openai_chat_completion_client.py
examples/basic/online_serving/openai_completion_client.py
examples/offline_inference/async_llm_streaming.py
examples/offline_inference/audio_language.py
examples/offline_inference/automatic_prefix_caching.py
examples/offline_inference/batch_llm_inference.py
examples/offline_inference/chat_with_tools.py
examples/offline_inference/context_extension.py
examples/offline_inference/data_parallel.py
examples/offline_inference/disaggregated_prefill.py
examples/offline_inference/encoder_decoder_multimodal.py
examples/offline_inference/extract_hidden_states.py
examples/offline_inference/llm_engine_example.py
examples/offline_inference/llm_engine_reset_kv.py
examples/offline_inference/load_sharded_state.py
examples/offline_inference/lora_with_quantization_inference.py
examples/offline_inference/metrics.py
examples/offline_inference/mistral-small.py
examples/offline_inference/mlpspeculator.py
examples/offline_inference/multilora_inference.py
examples/offline_inference/pause_resume.py
examples/offline_inference/prefix_caching.py
examples/offline_inference/prefix_caching_flexkv.py
examples/offline_inference/prompt_embed_inference.py
examples/offline_inference/qwen_1m.py
examples/offline_inference/reproducibility.py
examples/offline_inference/routed_experts_e2e.py
examples/offline_inference/run_one_batch.py
examples/offline_inference/save_sharded_state.py
examples/offline_inference/simple_profiling.py
examples/offline_inference/skip_loading_weights_in_engine_init.py
examples/offline_inference/spec_decode.py
examples/offline_inference/structured_outputs.py
examples/offline_inference/torchrun_dp_example.py
examples/offline_inference/torchrun_example.py
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/disaggregated-prefill-v1/README.md
examples/offline_inference/disaggregated-prefill-v1/decode_example.py
examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
examples/offline_inference/disaggregated-prefill-v1/run.sh
examples/offline_inference/kv_load_failure_recovery/README.md
examples/offline_inference/kv_load_failure_recovery/decode_example.py
examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py
examples/offline_inference/kv_load_failure_recovery/prefill_example.py
examples/offline_inference/kv_load_failure_recovery/run.sh
examples/offline_inference/logits_processor/README.md
examples/offline_inference/logits_processor/custom.py
examples/offline_inference/logits_processor/custom_req.py
examples/offline_inference/logits_processor/custom_req_init.py
examples/offline_inference/openai_batch/README.md
examples/offline_inference/openai_batch/openai_example_batch.jsonl
examples/offline_inference/qwen2_5_omni/README.md
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen3_omni/only_thinker.py
examples/online_serving/api_client.py
examples/online_serving/batched_chat_completions.py
examples/online_serving/data_parallel_pause_resume.py
examples/online_serving/disaggregated_prefill.sh
examples/online_serving/gradio_openai_chatbot_webserver.py
examples/online_serving/gradio_webserver.py
examples/online_serving/kv_events_subscriber.py
examples/online_serving/multi-node-serving.sh
examples/online_serving/multi_instance_data_parallel.py
examples/online_serving/openai_chat_completion_client_for_multimodal.py
examples/online_serving/openai_chat_completion_client_with_tools.py
examples/online_serving/openai_chat_completion_client_with_tools_required.py
examples/online_serving/openai_chat_completion_client_with_tools_xlam.py
examples/online_serving/openai_chat_completion_client_with_tools_xlam_streaming.py
examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
examples/online_serving/openai_chat_completion_with_reasoning.py
examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
examples/online_serving/openai_realtime_client.py
examples/online_serving/openai_realtime_microphone_client.py
examples/online_serving/openai_responses_client.py
examples/online_serving/openai_responses_client_with_mcp_tools.py
examples/online_serving/openai_responses_client_with_tools.py
examples/online_serving/openai_transcription_client.py
examples/online_serving/openai_translation_client.py
examples/online_serving/prompt_embed_inference_with_openai_client.py
examples/online_serving/ray_serve_deepseek.py
examples/online_serving/retrieval_augmented_generation_with_langchain.py
examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
examples/online_serving/run_cluster.sh
examples/online_serving/sagemaker-entrypoint.sh
examples/online_serving/streamlit_openai_chatbot_webserver.py
examples/online_serving/token_generation_client.py
examples/online_serving/utils.py
examples/online_serving/chart-helm/.helmignore
examples/online_serving/chart-helm/Chart.yaml
examples/online_serving/chart-helm/README.md
examples/online_serving/chart-helm/ct.yaml
examples/online_serving/chart-helm/lintconf.yaml
examples/online_serving/chart-helm/values.schema.json
examples/online_serving/chart-helm/values.yaml
examples/online_serving/chart-helm/templates/_helpers.tpl
examples/online_serving/chart-helm/templates/configmap.yaml
examples/online_serving/chart-helm/templates/custom-objects.yaml
examples/online_serving/chart-helm/templates/deployment.yaml
examples/online_serving/chart-helm/templates/hpa.yaml
examples/online_serving/chart-helm/templates/job.yaml
examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml
examples/online_serving/chart-helm/templates/pvc.yaml
examples/online_serving/chart-helm/templates/secrets.yaml
examples/online_serving/chart-helm/templates/service.yaml
examples/online_serving/chart-helm/tests/deployment_test.yaml
examples/online_serving/chart-helm/tests/job_test.yaml
examples/online_serving/chart-helm/tests/pvc_test.yaml
examples/online_serving/dashboards/README.md
examples/online_serving/dashboards/grafana/README.md
examples/online_serving/dashboards/grafana/performance_statistics.json
examples/online_serving/dashboards/grafana/query_statistics.json
examples/online_serving/dashboards/perses/README.md
examples/online_serving/dashboards/perses/performance_statistics.yaml
examples/online_serving/dashboards/perses/query_statistics.yaml
examples/online_serving/disaggregated_encoder/README.md
examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
examples/online_serving/disaggregated_serving/README.md
examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
examples/online_serving/disaggregated_serving/kv_events.sh
examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py
examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
examples/online_serving/ec_both_encoder/ec_both_encoder.sh
examples/online_serving/elastic_ep/bench.sh
examples/online_serving/elastic_ep/scale.py
examples/online_serving/elastic_ep/serve_deepseek_v2.sh
examples/online_serving/opentelemetry/README.md
examples/online_serving/opentelemetry/dummy_client.py
examples/online_serving/prometheus_grafana/README.md
examples/online_serving/prometheus_grafana/docker-compose.yaml
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/prometheus.yaml
examples/online_serving/structured_outputs/README.md
examples/online_serving/structured_outputs/pyproject.toml
examples/online_serving/structured_outputs/structured_outputs.py
examples/others/logging_configuration.md
examples/others/tensorize_vllm_model.py
examples/others/lmcache/README.md
examples/others/lmcache/cpu_offload_lmcache.py
examples/others/lmcache/disagg_prefill_lmcache_v0.py
examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
examples/pooling/classify/classification_online.py
examples/pooling/classify/vision_classification_online.py
examples/pooling/embed/embed_jina_embeddings_v3_offline.py
examples/pooling/embed/embed_matryoshka_fy_offline.py
examples/pooling/embed/embedding_requests_base64_online.py
examples/pooling/embed/embedding_requests_bytes_online.py
examples/pooling/embed/openai_embedding_client.py
examples/pooling/embed/openai_embedding_matryoshka_fy_client.py
examples/pooling/embed/vision_embedding_offline.py
examples/pooling/embed/vision_embedding_online.py
examples/pooling/embed/openai_embedding_long_text/README.md
examples/pooling/embed/openai_embedding_long_text/client.py
examples/pooling/embed/openai_embedding_long_text/service.sh
examples/pooling/embed/template/dse_qwen2_vl.jinja
examples/pooling/embed/template/nemotron_embed_vl.jinja
examples/pooling/embed/template/vlm2vec_phi3v.jinja
examples/pooling/embed/template/vlm2vec_qwen2vl.jinja
examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py
examples/pooling/plugin/prithvi_geospatial_mae_offline.py
examples/pooling/plugin/prithvi_geospatial_mae_online.py
examples/pooling/pooling/pooling_online.py
examples/pooling/score/cohere_rerank_client.py
examples/pooling/score/colbert_rerank_online.py
examples/pooling/score/colmodernvbert_rerank_online.py
examples/pooling/score/colqwen3_5_rerank_online.py
examples/pooling/score/colqwen3_rerank_online.py
examples/pooling/score/convert_model_to_seq_cls.py
examples/pooling/score/qwen3_reranker_offline.py
examples/pooling/score/qwen3_reranker_online.py
examples/pooling/score/rerank_api_online.py
examples/pooling/score/score_api_online.py
examples/pooling/score/using_template_offline.py
examples/pooling/score/using_template_online.py
examples/pooling/score/vision_rerank_api_online.py
examples/pooling/score/vision_reranker_offline.py
examples/pooling/score/vision_score_api_online.py
examples/pooling/score/template/bge-reranker-v2-gemma.jinja
examples/pooling/score/template/mxbai_rerank_v2.jinja
examples/pooling/score/template/nemotron-rerank.jinja
examples/pooling/score/template/nemotron-vl-rerank.jinja
examples/pooling/score/template/qwen3_reranker.jinja
examples/pooling/score/template/qwen3_vl_reranker.jinja
examples/pooling/token_classify/forced_alignment_offline.py
examples/pooling/token_classify/ner_offline.py
examples/pooling/token_classify/ner_online.py
examples/pooling/token_embed/colqwen3_token_embed_online.py
examples/pooling/token_embed/jina_embeddings_v4_offline.py
examples/pooling/token_embed/multi_vector_retrieval_offline.py
examples/pooling/token_embed/multi_vector_retrieval_online.py
examples/rl/rlhf_async_new_apis.py
examples/rl/rlhf_http_ipc.py
examples/rl/rlhf_http_nccl.py
examples/rl/rlhf_ipc.py
examples/rl/rlhf_nccl.py
examples/rl/rlhf_nccl_fsdp_ep.py
requirements/build.txt
requirements/common.txt
requirements/cpu-build.txt
requirements/cpu.txt
requirements/cuda.txt
requirements/dev.txt
requirements/docs.txt
requirements/kv_connectors.txt
requirements/kv_connectors_rocm.txt
requirements/lint.txt
requirements/nightly_torch_test.txt
requirements/rocm-build.txt
requirements/rocm-test.in
requirements/rocm-test.txt
requirements/rocm.txt
requirements/test.in
requirements/test.txt
requirements/tpu.txt
requirements/xpu-test.in
requirements/xpu-test.txt
requirements/xpu.txt
requirements/test/xpu.txt
scripts/autotune_helion_kernels.py
tests/__init__.py
tests/ci_envs.py
tests/conftest.py
tests/test_access_log_filter.py
tests/test_attention_backend_registry.py
tests/test_config.py
tests/test_embedded_commit.py
tests/test_envs.py
tests/test_inputs.py
tests/test_logger.py
tests/test_logprobs.py
tests/test_outputs.py
tests/test_pooling_params.py
tests/test_ray_env.py
tests/test_regression.py
tests/test_scalartype.py
tests/test_seed_behavior.py
tests/test_sequence.py
tests/test_triton_utils.py
tests/test_version.py
tests/test_vllm_port.py
tests/test_zen_cpu_platform_detection.py
tests/utils.py
tests/basic_correctness/__init__.py
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_prefetch_offload.py
tests/benchmarks/__init__.py
tests/benchmarks/test_bench_startup.py
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_plot_filters.py
tests/benchmarks/test_random_dataset.py
tests/benchmarks/test_random_multimodal_dataset_video.py
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_throughput_cli.py
tests/benchmarks/sweep/__init__.py
tests/benchmarks/sweep/test_param_sweep.py
tests/compile/README.md
tests/compile/__init__.py
tests/compile/backend.py
tests/compile/conftest.py
tests/compile/silly_attention.py
tests/compile/test_aot_compile.py
tests/compile/test_compile_ranges.py
tests/compile/test_config.py
tests/compile/test_decorator.py
tests/compile/test_dynamic_shapes_compilation.py
tests/compile/test_graph_partition.py
tests/compile/test_rotary_embedding_compile.py
tests/compile/test_sequence_parallelism_threshold.py
tests/compile/test_structured_logging.py
tests/compile/test_wrapper.py
tests/compile/correctness_e2e/__init__.py
tests/compile/correctness_e2e/test_async_tp.py
tests/compile/correctness_e2e/test_sequence_parallel.py
tests/compile/fullgraph/__init__.py
tests/compile/fullgraph/test_basic_correctness.py
tests/compile/fullgraph/test_full_cudagraph.py
tests/compile/fullgraph/test_full_graph.py
tests/compile/fullgraph/test_multimodal_compile.py
tests/compile/fullgraph/test_multiple_graphs.py
tests/compile/fullgraph/test_simple.py
tests/compile/fullgraph/test_toy_llama.py
tests/compile/fusions_e2e/__init__.py
tests/compile/fusions_e2e/common.py
tests/compile/fusions_e2e/conftest.py
tests/compile/fusions_e2e/models.py
tests/compile/fusions_e2e/test_tp1_quant.py
tests/compile/fusions_e2e/test_tp2_ar_rms.py
tests/compile/fusions_e2e/test_tp2_async_tp.py
tests/compile/h100/__init__.py
tests/compile/h100/test_startup.py
tests/compile/passes/__init__.py
tests/compile/passes/test_functionalization.py
tests/compile/passes/test_fuse_act_padding.py
tests/compile/passes/test_fusion.py
tests/compile/passes/test_fusion_attn.py
tests/compile/passes/test_noop_elimination.py
tests/compile/passes/test_pass_manager.py
tests/compile/passes/test_qk_norm_rope_fusion.py
tests/compile/passes/test_rope_kvcache_fusion.py
tests/compile/passes/test_scatter_split_replace.py
tests/compile/passes/test_silu_mul_quant_fusion.py
tests/compile/passes/test_split_coalescing.py
tests/compile/passes/distributed/__init__.py
tests/compile/passes/distributed/test_async_tp.py
tests/compile/passes/distributed/test_fusion_all_reduce.py
tests/compile/passes/distributed/test_sequence_parallelism.py
tests/config/base_model_arch_groundtruth.json
tests/config/draft_model_arch_groundtruth.json
tests/config/test_config.yaml
tests/config/test_config_generation.py
tests/config/test_config_utils.py
tests/config/test_config_with_model.yaml
tests/config/test_model_arch_config.py
tests/config/test_mp_reducer.py
tests/config/test_multimodal_config.py
tests/cuda/test_cuda_compatibility_path.py
tests/cuda/test_cuda_context.py
tests/cuda/test_platform_no_cuda_init.py
tests/cuda/scripts/check_device_count_respects_env.py
tests/cuda/scripts/check_platform_no_cuda_init.py
tests/detokenizer/__init__.py
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_min_tokens.py
tests/detokenizer/test_stop_reason.py
tests/detokenizer/test_stop_string_while_stop_model_terminates.py
tests/detokenizer/test_stop_strings.py
tests/distributed/__init__.py
tests/distributed/conftest.py
tests/distributed/eplb_utils.py
tests/distributed/test_ca_buffer_sharing.py
tests/distributed/test_comm_ops.py
tests/distributed/test_context_parallel.py
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_dcp_a2a.py
tests/distributed/test_distributed_oot.py
tests/distributed/test_elastic_ep.py
tests/distributed/test_eplb_algo.py
tests/distributed/test_eplb_execute.py
tests/distributed/test_eplb_fused_moe_layer.py
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
tests/distributed/test_eplb_spec_decode.py
tests/distributed/test_eplb_utils.py
tests/distributed/test_events.py
tests/distributed/test_expert_parallel.py
tests/distributed/test_expert_placement.py
tests/distributed/test_kvlayout.py
tests/distributed/test_mq_connect_ip.py
tests/distributed/test_multi_node_assignment.py
tests/distributed/test_multiproc_executor.py
tests/distributed/test_nccl_symm_mem_allreduce.py
tests/distributed/test_node_count.py
tests/distributed/test_packed_tensor.py
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_partition.py
tests/distributed/test_pp_cudagraph.py
tests/distributed/test_pynccl.py
tests/distributed/test_quick_all_reduce.py
tests/distributed/test_same_node.py
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_buffer.py
tests/distributed/test_shm_storage.py
tests/distributed/test_symm_mem_allreduce.py
tests/distributed/test_torchrun_example.py
tests/distributed/test_torchrun_example_moe.py
tests/distributed/test_utils.py
tests/distributed/test_weight_transfer.py
tests/engine/__init__.py
tests/engine/test_arg_utils.py
tests/engine/test_short_mm_context.py
tests/entrypoints/__init__.py
tests/entrypoints/conftest.py
tests/entrypoints/test_api_server_process_manager.py
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_context.py
tests/entrypoints/test_launch_cli.py
tests/entrypoints/test_ssl_cert_refresher.py
tests/entrypoints/test_utils.py
tests/entrypoints/anthropic/__init__.py
tests/entrypoints/anthropic/test_anthropic_messages_conversion.py
tests/entrypoints/anthropic/test_messages.py
tests/entrypoints/llm/__init__.py
tests/entrypoints/llm/test_accuracy.py
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_gpu_utilization.py
tests/entrypoints/llm/test_mm_cache_stats.py
tests/entrypoints/llm/test_mm_embeds_only.py
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_struct_output_generate.py
tests/entrypoints/offline_mode/__init__.py
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/openai/__init__.py
tests/entrypoints/openai/conftest.py
tests/entrypoints/openai/test_async_tokenization.py
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_multi_api_servers.py
tests/entrypoints/openai/test_openai_schema.py
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_uds.py
tests/entrypoints/openai/utils.py
tests/entrypoints/openai/chat_completion/__init__.py
tests/entrypoints/openai/chat_completion/test_audio.py
tests/entrypoints/openai/chat_completion/test_audio_in_video.py
tests/entrypoints/openai/chat_completion/test_batched_chat_completions.py
tests/entrypoints/openai/chat_completion/test_chat.py
tests/entrypoints/openai/chat_completion/test_chat_completion.py
tests/entrypoints/openai/chat_completion/test_chat_echo.py
tests/entrypoints/openai/chat_completion/test_chat_error.py
tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py
tests/entrypoints/openai/chat_completion/test_default_mm_loras.py
tests/entrypoints/openai/chat_completion/test_enable_force_include_usage.py
tests/entrypoints/openai/chat_completion/test_oot_registration.py
tests/entrypoints/openai/chat_completion/test_root_path.py
tests/entrypoints/openai/chat_completion/test_serving_chat.py
tests/entrypoints/openai/chat_completion/test_serving_chat_stream_harmony.py
tests/entrypoints/openai/chat_completion/test_video.py
tests/entrypoints/openai/chat_completion/test_vision.py
tests/entrypoints/openai/chat_completion/test_vision_embeds.py
tests/entrypoints/openai/completion/__init__.py
tests/entrypoints/openai/completion/test_completion.py
tests/entrypoints/openai/completion/test_completion_error.py
tests/entrypoints/openai/completion/test_completion_with_prompt_embeds.py
tests/entrypoints/openai/completion/test_lora_resolvers.py
tests/entrypoints/openai/completion/test_prompt_validation.py
tests/entrypoints/openai/completion/test_shutdown.py
tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
tests/entrypoints/openai/completion/test_token_in_token_out.py
tests/entrypoints/openai/correctness/__init__.py
tests/entrypoints/openai/correctness/test_lmeval.py
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
tests/entrypoints/openai/models/__init__.py
tests/entrypoints/openai/models/test_models.py
tests/entrypoints/openai/parser/__init__.py
tests/entrypoints/openai/parser/test_harmony_utils.py
tests/entrypoints/openai/realtime/__init__.py
tests/entrypoints/openai/realtime/test_realtime_validation.py
tests/entrypoints/openai/responses/__init__.py
tests/entrypoints/openai/responses/conftest.py
tests/entrypoints/openai/responses/test_basic.py
tests/entrypoints/openai/responses/test_errors.py
tests/entrypoints/openai/responses/test_function_call.py
tests/entrypoints/openai/responses/test_function_call_parsing.py
tests/entrypoints/openai/responses/test_harmony.py
tests/entrypoints/openai/responses/test_harmony_utils.py
tests/entrypoints/openai/responses/test_image.py
tests/entrypoints/openai/responses/test_mcp_tools.py
tests/entrypoints/openai/responses/test_parsable_context.py
tests/entrypoints/openai/responses/test_protocol.py
tests/entrypoints/openai/responses/test_responses_utils.py
tests/entrypoints/openai/responses/test_sampling_params.py
tests/entrypoints/openai/responses/test_serving_responses.py
tests/entrypoints/openai/responses/test_simple.py
tests/entrypoints/openai/responses/test_stateful.py
tests/entrypoints/openai/responses/test_structured_output.py
tests/entrypoints/openai/speech_to_text/__init__.py
tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
tests/entrypoints/openai/speech_to_text/test_transcription_validation_whisper.py
tests/entrypoints/openai/speech_to_text/test_translation_validation.py
tests/entrypoints/openai/tool_parsers/__init__.py
tests/entrypoints/openai/tool_parsers/test_granite4_tool_parser.py
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py
tests/entrypoints/pooling/__init__.py
tests/entrypoints/pooling/basic/__init__.py
tests/entrypoints/pooling/basic/test_encode.py
tests/entrypoints/pooling/basic/test_truncation.py
tests/entrypoints/pooling/classify/__init__.py
tests/entrypoints/pooling/classify/test_offline.py
tests/entrypoints/pooling/classify/test_online.py
tests/entrypoints/pooling/classify/test_online_vision.py
tests/entrypoints/pooling/embed/__init__.py
tests/entrypoints/pooling/embed/conftest.py
tests/entrypoints/pooling/embed/test_cohere_online.py
tests/entrypoints/pooling/embed/test_cohere_online_vision.py
tests/entrypoints/pooling/embed/test_cohere_openai_parity.py
tests/entrypoints/pooling/embed/test_correctness_mteb.py
tests/entrypoints/pooling/embed/test_io_processor.py
tests/entrypoints/pooling/embed/test_offline.py
tests/entrypoints/pooling/embed/test_online.py
tests/entrypoints/pooling/embed/test_online_dimensions.py
tests/entrypoints/pooling/embed/test_online_long_text.py
tests/entrypoints/pooling/embed/test_online_vision.py
tests/entrypoints/pooling/embed/test_protocol.py
tests/entrypoints/pooling/pooling/__init__.py
tests/entrypoints/pooling/pooling/test_online.py
tests/entrypoints/pooling/reward/__init__.py
tests/entrypoints/pooling/reward/test_offline.py
tests/entrypoints/pooling/scoring/__init__.py
tests/entrypoints/pooling/scoring/test_bi_encoder_offline.py
tests/entrypoints/pooling/scoring/test_bi_encoder_online.py
tests/entrypoints/pooling/scoring/test_cross_encoder_correctness_mteb.py
tests/entrypoints/pooling/scoring/test_cross_encoder_offline.py
tests/entrypoints/pooling/scoring/test_cross_encoder_online.py
tests/entrypoints/pooling/scoring/test_cross_encoder_online_vision.py
tests/entrypoints/pooling/scoring/test_late_interaction_offline.py
tests/entrypoints/pooling/scoring/test_late_interaction_online.py
tests/entrypoints/pooling/scoring/test_utils.py
tests/entrypoints/pooling/scoring/util.py
tests/entrypoints/pooling/token_classify/__init__.py
tests/entrypoints/pooling/token_classify/test_offline.py
tests/entrypoints/pooling/token_classify/test_online.py
tests/entrypoints/pooling/token_embed/__init__.py
tests/entrypoints/pooling/token_embed/test_offline.py
tests/entrypoints/pooling/token_embed/test_online.py
tests/entrypoints/rpc/__init__.py
tests/entrypoints/rpc/test_collective_rpc.py
tests/entrypoints/sagemaker/__init__.py
tests/entrypoints/sagemaker/conftest.py
tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py
tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py
tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py
tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py
tests/entrypoints/serve/__init__.py
tests/entrypoints/serve/disagg/__init__.py
tests/entrypoints/serve/disagg/test_serving_tokens.py
tests/entrypoints/serve/instrumentator/__init__.py
tests/entrypoints/serve/instrumentator/test_basic.py
tests/entrypoints/serve/instrumentator/test_metrics.py
tests/entrypoints/serve/instrumentator/test_optional_middleware.py
tests/entrypoints/serve/instrumentator/test_orca_metrics.py
tests/entrypoints/serve/instrumentator/test_sleep.py
tests/entrypoints/serve/lora/__init__.py
tests/entrypoints/serve/lora/test_lora_adapters.py
tests/entrypoints/serve/lora/test_serving_models.py
tests/entrypoints/serve/render/__init__.py
tests/entrypoints/serve/render/test_launch_render.py
tests/entrypoints/serve/render/test_render.py
tests/entrypoints/serve/render/test_render_multimodal.py
tests/entrypoints/serve/tokenize/__init__.py
tests/entrypoints/serve/tokenize/test_tokenization.py
tests/entrypoints/serve/tokenize/test_tokenization_vlm.py
tests/entrypoints/weight_transfer/__init__.py
tests/entrypoints/weight_transfer/test_weight_transfer_llm.py
tests/evals/gpt_oss/README.md
tests/evals/gpt_oss/__init__.py
tests/evals/gpt_oss/conftest.py
tests/evals/gpt_oss/test_gpqa_correctness.py
tests/evals/gpt_oss/configs/gpt-oss-20b-baseline.yaml
tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-bf16.yaml
tests/evals/gpt_oss/configs/gpt-oss-20b-flashinfer-mxfp4-mxfp8-cutlass.yaml
tests/evals/gpt_oss/configs/gpt-oss-20b-marlin.yaml
tests/evals/gpt_oss/configs/gpt-oss-20b-rocm-baseline.yaml
tests/evals/gpt_oss/configs/gpt-oss-20b-sm100-fi-mxfp4-mxfp8-trtllm.yaml
tests/evals/gpt_oss/configs/models-b200.txt
tests/evals/gpt_oss/configs/models-gfx942.txt
tests/evals/gpt_oss/configs/models-gfx950.txt
tests/evals/gpt_oss/configs/models-h100.txt
tests/evals/gsm8k/README.md
tests/evals/gsm8k/__init__.py
tests/evals/gsm8k/conftest.py
tests/evals/gsm8k/gsm8k_eval.py
tests/evals/gsm8k/test_gsm8k_correctness.py
tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml
tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
tests/evals/gsm8k/configs/DeepSeek-V3.2-DP.yaml
tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml
tests/evals/gsm8k/configs/DeepSeek-V3.2-TP.yaml
tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Nemotron-3-Super-120B-A12B-BF16.yaml
tests/evals/gsm8k/configs/Nemotron-3-Super-120B-A12B-FP8.yaml
tests/evals/gsm8k/configs/Nemotron-3-Super-120B-A12B-NVFP4.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-30B-A3B-MXFP4A16.yaml
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2_MI355.yaml
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
tests/evals/gsm8k/configs/Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
tests/evals/gsm8k/configs/models-blackwell.txt
tests/evals/gsm8k/configs/models-h200.txt
tests/evals/gsm8k/configs/models-mi3xx-fp8-and-mixed.txt
tests/evals/gsm8k/configs/models-mi3xx.txt
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
tests/evals/gsm8k/configs/models-qwen35-mi355.txt
tests/evals/gsm8k/configs/models-small.txt
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-CT-vllm-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-Fp8-ModelOpt-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-NvFp4-ModelOpt-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Nemotron-Nano-30B-NvFp4-ModelOpt-vllm-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-BF16-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-triton.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor/config-b200.txt
tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
tests/evals/gsm8k/configs/moe-refactor/config-test.txt
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-BF16-triton.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
tests/kernels/__init__.py
tests/kernels/allclose_default.py
tests/kernels/quant_utils.py
tests/kernels/test_apply_repetition_penalties.py
tests/kernels/test_cache_kernels.py
tests/kernels/test_concat_mla_q.py
tests/kernels/test_cp_gather_fp8.py
tests/kernels/test_fla_layernorm_guard.py
tests/kernels/test_flex_attention.py
tests/kernels/test_fused_quant_activation.py
tests/kernels/test_fused_recurrent_packed_decode.py
tests/kernels/test_fused_sigmoid_gating_delta_rule.py
tests/kernels/test_onednn.py
tests/kernels/test_shuffle_rows.py
tests/kernels/test_top_k_per_row.py
tests/kernels/utils.py
tests/kernels/attention/conftest.py
tests/kernels/attention/test_aiter_flash_attn.py
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention_selector.py
tests/kernels/attention/test_cache.py
tests/kernels/attention/test_cascade_flash_attn.py
tests/kernels/attention/test_cpu_attn.py
tests/kernels/attention/test_cutlass_mla_decode.py
tests/kernels/attention/test_deepgemm_attention.py
tests/kernels/attention/test_flash_attn.py
tests/kernels/attention/test_flashinfer.py
tests/kernels/attention/test_flashinfer_mla_decode.py
tests/kernels/attention/test_flashinfer_trtllm_attention.py
tests/kernels/attention/test_flashmla.py
tests/kernels/attention/test_flashmla_sparse.py
tests/kernels/attention/test_lightning_attn.py
tests/kernels/attention/test_merge_attn_states.py
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mla_decode_cpu.py
tests/kernels/attention/test_pack_unpack_triton.py
tests/kernels/attention/test_prefix_prefill.py
tests/kernels/attention/test_rocm_attention_selector.py
tests/kernels/attention/test_triton_decode_attention.py
tests/kernels/attention/test_triton_prefill_attention.py
tests/kernels/attention/test_triton_unified_attention.py
tests/kernels/attention/test_trtllm_kvfp8_dequant.py
tests/kernels/attention/test_use_trtllm_attention.py
tests/kernels/attention/test_xpu_mla_sparse.py
tests/kernels/core/test_activation.py
tests/kernels/core/test_apply_rotary_emb.py
tests/kernels/core/test_fused_qk_norm_rope.py
tests/kernels/core/test_fused_quant_layernorm.py
tests/kernels/core/test_fused_rms_norm_gated.py
tests/kernels/core/test_layernorm.py
tests/kernels/core/test_minimax_reduce_rms.py
tests/kernels/core/test_mrope.py
tests/kernels/core/test_opcheck.py
tests/kernels/core/test_permute_cols.py
tests/kernels/core/test_pos_encoding.py
tests/kernels/core/test_rotary_embedding.py
tests/kernels/core/test_rotary_embedding_mla_cache_fused.py
tests/kernels/core/test_uva.py
tests/kernels/helion/helpers.py
tests/kernels/helion/test_autotune.py
tests/kernels/helion/test_config_manager.py
tests/kernels/helion/test_helion_available.py
tests/kernels/helion/test_pattern_matching.py
tests/kernels/helion/test_register.py
tests/kernels/helion/test_silu_mul_fp8.py
tests/kernels/helion/test_utils.py
tests/kernels/mamba/test_causal_conv1d.py
tests/kernels/mamba/test_mamba_mixer2.py
tests/kernels/mamba/test_mamba_ssm.py
tests/kernels/mamba/test_mamba_ssm_ssd.py
tests/kernels/moe/__init__.py
tests/kernels/moe/parallel_utils.py
tests/kernels/moe/test_batched_deepgemm.py
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_block_fp8.py
tests/kernels/moe/test_block_int8.py
tests/kernels/moe/test_count_expert_num_tokens.py
tests/kernels/moe/test_cpu_fused_moe.py
tests/kernels/moe/test_cutedsl_moe.py
tests/kernels/moe/test_cutlass_moe.py
tests/kernels/moe/test_cutlass_mxfp8_grouped_mm.py
tests/kernels/moe/test_deepep_deepgemm_moe.py
tests/kernels/moe/test_deepep_moe.py
tests/kernels/moe/test_deepgemm.py
tests/kernels/moe/test_flashinfer.py
tests/kernels/moe/test_flashinfer_moe.py
tests/kernels/moe/test_fused_topk.py
tests/kernels/moe/test_gpt_oss_triton_kernels.py
tests/kernels/moe/test_grouped_topk.py
tests/kernels/moe/test_marlin_vs_trtllm_mxint4.py
tests/kernels/moe/test_modular_kernel_combinations.py
tests/kernels/moe/test_modular_oai_triton_moe.py
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe_align_block_size.py
tests/kernels/moe/test_moe_permute_unpermute.py
tests/kernels/moe/test_nvfp4_moe.py
tests/kernels/moe/test_ocp_mx_moe.py
tests/kernels/moe/test_rocm_aiter_topk.py
tests/kernels/moe/test_router_gemm.py
tests/kernels/moe/test_routing.py
tests/kernels/moe/test_routing_simulator.py
tests/kernels/moe/test_shared_fused_moe_routed_transform.py
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
tests/kernels/moe/test_triton_moe_no_act_mul.py
tests/kernels/moe/test_triton_moe_ptpc_fp8.py
tests/kernels/moe/test_unquantized_backend_selection.py
tests/kernels/moe/utils.py
tests/kernels/moe/modular_kernel_tools/__init__.py
tests/kernels/moe/modular_kernel_tools/cli_args.py
tests/kernels/moe/modular_kernel_tools/common.py
tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
tests/kernels/moe/modular_kernel_tools/mk_objects.py
tests/kernels/moe/modular_kernel_tools/parallel_utils.py
tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
tests/kernels/quantization/nvfp4_utils.py
tests/kernels/quantization/test_allspark_gemm.py
tests/kernels/quantization/test_awq.py
tests/kernels/quantization/test_awq_triton.py
tests/kernels/quantization/test_block_fp8.py
tests/kernels/quantization/test_block_int8.py
tests/kernels/quantization/test_cutlass_scaled_mm.py
tests/kernels/quantization/test_cutlass_w4a8.py
tests/kernels/quantization/test_cutlass_w4a8_moe.py
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
tests/kernels/quantization/test_flashinfer_scaled_mm.py
tests/kernels/quantization/test_fp8_min_max_helper.py
tests/kernels/quantization/test_fp8_quant.py
tests/kernels/quantization/test_fp8_quant_group.py
tests/kernels/quantization/test_ggml.py
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gptq.py
tests/kernels/quantization/test_hadacore.py
tests/kernels/quantization/test_int8_kernel.py
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_machete_mm.py
tests/kernels/quantization/test_marlin_gemm.py
tests/kernels/quantization/test_mxfp4_qutlass.py
tests/kernels/quantization/test_mxfp4_triton_ep.py
tests/kernels/quantization/test_nvfp4_quant.py
tests/kernels/quantization/test_nvfp4_qutlass.py
tests/kernels/quantization/test_nvfp4_scaled_mm.py
tests/kernels/quantization/test_per_token_group_quant.py
tests/kernels/quantization/test_rocm_skinny_gemms.py
tests/kernels/quantization/test_scaled_mm_kernel_selection.py
tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
tests/kernels/quantization/test_triton_scaled_mm.py
tests/lora/__init__.py
tests/lora/conftest.py
tests/lora/test_add_lora.py
tests/lora/test_chatglm3_tp.py
tests/lora/test_deepseekv2_tp.py
tests/lora/test_default_mm_loras.py
tests/lora/test_fused_moe_lora_kernel.py
tests/lora/test_gptoss_tp.py
tests/lora/test_layers.py
tests/lora/test_llama_tp.py
tests/lora/test_llm_with_multi_loras.py
tests/lora/test_lora_checkpoints.py
tests/lora/test_lora_functions.py
tests/lora/test_lora_huggingface.py
tests/lora/test_lora_manager.py
tests/lora/test_lora_utils.py
tests/lora/test_minicpmv_tp.py
tests/lora/test_mixtral.py
tests/lora/test_moe_lora_align_sum.py
tests/lora/test_olmoe_tp.py
tests/lora/test_peft_helper.py
tests/lora/test_punica_ops.py
tests/lora/test_punica_ops_fp8.py
tests/lora/test_punica_xpu_ops.py
tests/lora/test_quant_model.py
tests/lora/test_qwen35_densemodel_lora.py
tests/lora/test_qwen3_unembed.py
tests/lora/test_qwen3moe_tp.py
tests/lora/test_qwenvl.py
tests/lora/test_resolver.py
tests/lora/test_transformers_model.py
tests/lora/test_utils.py
tests/lora/test_whisper.py
tests/lora/test_worker.py
tests/lora/utils.py
tests/model_executor/__init__.py
tests/model_executor/test_cpu_unquantized_gemm_dispatch.py
tests/model_executor/test_eagle_quantization.py
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_model_load_with_params.py
tests/model_executor/test_oink_integration.py
tests/model_executor/test_qwen3_omni.py
tests/model_executor/test_qwen3_vl_mrope.py
tests/model_executor/test_routed_experts_capture.py
tests/model_executor/test_weight_utils.py
tests/model_executor/layers/test_rocm_unquantized_gemm.py
tests/model_executor/model_loader/__init__.py
tests/model_executor/model_loader/test_ep_weight_filter.py
tests/model_executor/model_loader/test_registry.py
tests/model_executor/model_loader/test_reload.py
tests/model_executor/model_loader/test_sharded_state_loader.py
tests/model_executor/model_loader/fastsafetensors_loader/__init__.py
tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py
tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py
tests/model_executor/model_loader/instanttensor_loader/__init__.py
tests/model_executor/model_loader/instanttensor_loader/test_instanttensor_loader.py
tests/model_executor/model_loader/instanttensor_loader/test_weight_utils.py
tests/model_executor/model_loader/runai_streamer_loader/__init__.py
tests/model_executor/model_loader/runai_streamer_loader/conftest.py
tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_loader.py
tests/model_executor/model_loader/runai_streamer_loader/test_runai_model_streamer_s3.py
tests/model_executor/model_loader/runai_streamer_loader/test_runai_utils.py
tests/model_executor/model_loader/runai_streamer_loader/test_weight_utils.py
tests/model_executor/model_loader/tensorizer_loader/__init__.py
tests/model_executor/model_loader/tensorizer_loader/conftest.py
tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py
tests/models/__init__.py
tests/models/registry.py
tests/models/test_gguf_download.py
tests/models/test_initialization.py
tests/models/test_oot_registration.py
tests/models/test_registry.py
tests/models/test_terratorch.py
tests/models/test_transformers.py
tests/models/test_utils.py
tests/models/test_vision.py
tests/models/utils.py
tests/models/fixtures/mistral_small_3_chat.json
tests/models/fixtures/pixtral_chat.json
tests/models/fixtures/qwen2_5_math_prm_reward_step.json
tests/models/fixtures/audioflamingo3/expected_results_batched.json
tests/models/fixtures/audioflamingo3/expected_results_single.json
tests/models/fixtures/musicflamingo/expected_results_batched.json
tests/models/fixtures/musicflamingo/expected_results_single.json
tests/models/language/__init__.py
tests/models/language/generation/__init__.py
tests/models/language/generation/conftest.py
tests/models/language/generation/test_common.py
tests/models/language/generation/test_gemma.py
tests/models/language/generation/test_granite.py
tests/models/language/generation/test_grok.py
tests/models/language/generation/test_hybrid.py
tests/models/language/generation/test_mistral.py
tests/models/language/generation/test_phimoe.py
tests/models/language/generation_ppl_test/__init__.py
tests/models/language/generation_ppl_test/ppl_utils.py
tests/models/language/generation_ppl_test/test_gemma.py
tests/models/language/generation_ppl_test/test_gpt.py
tests/models/language/generation_ppl_test/test_qwen.py
tests/models/language/pooling/__init__.py
tests/models/language/pooling/conftest.py
tests/models/language/pooling/embed_utils.py
tests/models/language/pooling/test_all_pooling_plus_chunked_prefill.py
tests/models/language/pooling/test_auto_prefix_cache_support.py
tests/models/language/pooling/test_bge_m3.py
tests/models/language/pooling/test_classification.py
tests/models/language/pooling/test_colbert.py
tests/models/language/pooling/test_embedding.py
tests/models/language/pooling/test_extract_hidden_states.py
tests/models/language/pooling/test_gritlm.py
tests/models/language/pooling/test_head_dtype.py
tests/models/language/pooling/test_mm_classifier_conversion.py
tests/models/language/pooling/test_multi_vector_retrieval.py
tests/models/language/pooling/test_multilabel_classification_support.py
tests/models/language/pooling/test_nomic_max_model_len.py
tests/models/language/pooling/test_pooler_config_init_behaviour.py
tests/models/language/pooling/test_reward.py
tests/models/language/pooling/test_splade_sparse_pooler.py
tests/models/language/pooling/test_token_classification.py
tests/models/language/pooling/test_truncation_control.py
tests/models/language/pooling_mteb_test/__init__.py
tests/models/language/pooling_mteb_test/mteb_embed_utils.py
tests/models/language/pooling_mteb_test/mteb_score_utils.py
tests/models/language/pooling_mteb_test/test_baai.py
tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
tests/models/language/pooling_mteb_test/test_cross_encoder.py
tests/models/language/pooling_mteb_test/test_ernie.py
tests/models/language/pooling_mteb_test/test_gte.py
tests/models/language/pooling_mteb_test/test_intfloat.py
tests/models/language/pooling_mteb_test/test_jina.py
tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
tests/models/language/pooling_mteb_test/test_nemotron.py
tests/models/language/pooling_mteb_test/test_nomic.py
tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
tests/models/language/pooling_mteb_test/test_st_projector.py
tests/models/language/pooling_mteb_test/test_voyage.py
tests/models/multimodal/__init__.py
tests/models/multimodal/conftest.py
tests/models/multimodal/test_mapping.py
tests/models/multimodal/generation/__init__.py
tests/models/multimodal/generation/test_audioflamingo3.py
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_interleaved.py
tests/models/multimodal/generation/test_keye.py
tests/models/multimodal/generation/test_maverick.py
tests/models/multimodal/generation/test_multimodal_gguf.py
tests/models/multimodal/generation/test_musicflamingo.py
tests/models/multimodal/generation/test_nemotron_parse.py
tests/models/multimodal/generation/test_phi4mm.py
tests/models/multimodal/generation/test_phi4siglip.py
tests/models/multimodal/generation/test_pixtral.py
tests/models/multimodal/generation/test_qwen2_5_vl.py
tests/models/multimodal/generation/test_qwen2_vl.py
tests/models/multimodal/generation/test_ultravox.py
tests/models/multimodal/generation/test_vit_backend_functionality.py
tests/models/multimodal/generation/test_voxtral.py
tests/models/multimodal/generation/test_voxtral_realtime.py
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/vlm_utils/__init__.py
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/case_filtering.py
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/runners.py
tests/models/multimodal/generation/vlm_utils/types.py
tests/models/multimodal/pooling/__init__.py
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/test_clip.py
tests/models/multimodal/pooling/test_colmodernvbert.py
tests/models/multimodal/pooling/test_colpali.py
tests/models/multimodal/pooling/test_colqwen3.py
tests/models/multimodal/pooling/test_colqwen3_5.py
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
tests/models/multimodal/pooling/test_intern_vit.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
tests/models/multimodal/pooling/test_llava_next.py
tests/models/multimodal/pooling/test_phi3v.py
tests/models/multimodal/pooling/test_prithvi_mae.py
tests/models/multimodal/pooling/test_qwen3_asr_forced_aligner.py
tests/models/multimodal/pooling/test_radio.py
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/processing/__init__.py
tests/models/multimodal/processing/test_audio_in_video.py
tests/models/multimodal/processing/test_audioflamingo3.py
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_deepseek_ocr.py
tests/models/multimodal/processing/test_gemma3.py
tests/models/multimodal/processing/test_gemma4.py
tests/models/multimodal/processing/test_glm4_1v.py
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_llama4.py
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_minimax_vl_01.py
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_musicflamingo.py
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi4mm.py
tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen3_omni.py
tests/models/multimodal/processing/test_qwen3_vl.py
tests/models/multimodal/processing/test_smolvlm.py
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_transformers.py
tests/models/quantization/__init__.py
tests/models/quantization/test_awq.py
tests/models/quantization/test_bitsandbytes.py
tests/models/quantization/test_fp8.py
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gpt_oss.py
tests/models/quantization/test_gptq_marlin.py
tests/models/quantization/test_modelopt.py
tests/models/quantization/test_mxfp4.py
tests/models/quantization/test_mxfp8.py
tests/models/quantization/test_nvfp4.py
tests/multimodal/__init__.py
tests/multimodal/test_audio.py
tests/multimodal/test_cache.py
tests/multimodal/test_embedding_shape_validation.py
tests/multimodal/test_embedding_shape_validation_unit.py
tests/multimodal/test_hasher.py
tests/multimodal/test_image.py
tests/multimodal/test_inputs.py
tests/multimodal/test_processing.py
tests/multimodal/test_registry.py
tests/multimodal/test_sparse_tensor_validation_unit.py
tests/multimodal/test_utils.py
tests/multimodal/test_video.py
tests/multimodal/utils.py
tests/multimodal/assets/corrupted.mp4
tests/multimodal/assets/image1.png
tests/multimodal/assets/image2.png
tests/multimodal/assets/rgba.png
tests/multimodal/media/__init__.py
tests/multimodal/media/test_audio.py
tests/multimodal/media/test_base.py
tests/multimodal/media/test_connector.py
tests/multimodal/media/test_image.py
tests/multimodal/media/test_video.py
tests/plugins/bge_m3_sparse_plugin/setup.py
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
tests/plugins/lora_resolvers/__init__.py
tests/plugins/lora_resolvers/test_filesystem_resolver.py
tests/plugins/lora_resolvers/test_hf_hub_resolver.py
tests/plugins/prithvi_io_processor_plugin/setup.py
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
tests/plugins/vllm_add_dummy_model/setup.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
tests/plugins/vllm_add_dummy_platform/setup.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
tests/plugins/vllm_add_dummy_stat_logger/setup.py
tests/plugins/vllm_add_dummy_stat_logger/dummy_stat_logger/dummy_stat_logger.py
tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
tests/plugins_tests/test_io_processor_plugins.py
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_stats_logger_plugins.py
tests/plugins_tests/test_terratorch_io_processor_plugins.py
tests/prompts/example.txt
tests/prompts/summary.txt
tests/quantization/__init__.py
tests/quantization/fp_quant.py
tests/quantization/reference_mxfp4.py
tests/quantization/test_auto_round.py
tests/quantization/test_blackwell_moe.py
tests/quantization/test_compressed_tensors.py
tests/quantization/test_configs.py
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_wna16.py
tests/quantization/test_experts_int8.py
tests/quantization/test_fp8.py
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_v2.py
tests/quantization/test_lm_head.py
tests/quantization/test_mi3xx_moe.py
tests/quantization/test_mixed_precision.py
tests/quantization/test_modelopt.py
tests/quantization/test_quark.py
tests/quantization/test_register_quantization_config.py
tests/quantization/test_torchao.py
tests/quantization/utils.py
tests/reasoning/__init__.py
tests/reasoning/test_base_thinking_reasoning_parser.py
tests/reasoning/test_deepseekr1_reasoning_parser.py
tests/reasoning/test_deepseekv3_reasoning_parser.py
tests/reasoning/test_ernie45_reasoning_parser.py
tests/reasoning/test_gemma4_reasoning_parser.py
tests/reasoning/test_glm4_moe_reasoning_parser.py
tests/reasoning/test_gptoss_reasoning_parser.py
tests/reasoning/test_granite_reasoning_parser.py
tests/reasoning/test_holo2_reasoning_parser.py
tests/reasoning/test_hunyuan_reasoning_parser.py
tests/reasoning/test_kimi_k2_reasoning_parser.py
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
tests/reasoning/test_minimax_m2_reasoning_parser.py
tests/reasoning/test_mistral_reasoning_parser.py
tests/reasoning/test_nemotron_v3_reasoning_parser.py
tests/reasoning/test_olmo3_reasoning_parser.py
tests/reasoning/test_qwen3_reasoning_parser.py
tests/reasoning/test_seedoss_reasoning_parser.py
tests/reasoning/test_step3p5_reasoning_parser.py
tests/reasoning/utils.py
tests/renderers/__init__.py
tests/renderers/test_completions.py
tests/renderers/test_gemma4_chat_template.py
tests/renderers/test_hf.py
tests/renderers/test_mistral.py
tests/renderers/test_process_multi_modal_uuids.py
tests/renderers/test_sparse_tensor_validation.py
tests/renderers/inputs/__init__.py
tests/renderers/inputs/test_preprocess.py
tests/rocm/aiter/test_grouped_quant.py
tests/rocm/aiter/test_mla_fp8_support_check.py
tests/samplers/__init__.py
tests/samplers/test_beam_search.py
tests/samplers/test_ignore_eos.py
tests/samplers/test_logprobs.py
tests/samplers/test_no_bad_words.py
tests/standalone_tests/lazy_imports.py
tests/standalone_tests/python_only_compile.sh
tests/standalone_tests/pytorch_nightly_dependency.sh
tests/system_messages/sonnet3.5_nov2024.txt
tests/tokenizers_/__init__.py
tests/tokenizers_/test_basic.py
tests/tokenizers_/test_detokenize.py
tests/tokenizers_/test_hf.py
tests/tokenizers_/test_mistral.py
tests/tokenizers_/test_registry.py
tests/tool_parsers/__init__.py
tests/tool_parsers/common_tests.py
tests/tool_parsers/conftest.py
tests/tool_parsers/test_deepseekv31_tool_parser.py
tests/tool_parsers/test_deepseekv32_tool_parser.py
tests/tool_parsers/test_deepseekv3_tool_parser.py
tests/tool_parsers/test_ernie45_moe_tool_parser.py
tests/tool_parsers/test_functiongemma_tool_parser.py
tests/tool_parsers/test_gemma4_tool_parser.py
tests/tool_parsers/test_gigachat3_tool_parser.py
tests/tool_parsers/test_glm47_moe_tool_parser.py
tests/tool_parsers/test_glm4_moe_tool_parser.py
tests/tool_parsers/test_granite4_tool_parser.py
tests/tool_parsers/test_granite_20b_fc_tool_parser.py
tests/tool_parsers/test_granite_tool_parser.py
tests/tool_parsers/test_hermes_tool_parser.py
tests/tool_parsers/test_hunyuan_a13b_tool_parser.py
tests/tool_parsers/test_internlm2_tool_parser.py
tests/tool_parsers/test_jamba_tool_parser.py
tests/tool_parsers/test_kimi_k2_tool_parser.py
tests/tool_parsers/test_llama3_json_tool_parser.py
tests/tool_parsers/test_llama4_pythonic_tool_parser.py
tests/tool_parsers/test_longcat_tool_parser.py
tests/tool_parsers/test_minimax_m2_tool_parser.py
tests/tool_parsers/test_minimax_tool_parser.py
tests/tool_parsers/test_mistral_tool_parser.py
tests/tool_parsers/test_olmo3_tool_parser.py
tests/tool_parsers/test_openai_tool_parser.py
tests/tool_parsers/test_phi4mini_tool_parser.py
tests/tool_parsers/test_pythonic_tool_parser.py
tests/tool_parsers/test_qwen3coder_tool_parser.py
tests/tool_parsers/test_qwen3xml_tool_parser.py
tests/tool_parsers/test_seed_oss_tool_parser.py
tests/tool_parsers/test_step3_tool_parser.py
tests/tool_parsers/test_step3p5_tool_parser.py
tests/tool_parsers/test_xlam_tool_parser.py
tests/tool_parsers/utils.py
tests/tool_use/__init__.py
tests/tool_use/conftest.py
tests/tool_use/test_chat_completion_request_validations.py
tests/tool_use/test_chat_completions.py
tests/tool_use/test_parallel_tool_calls.py
tests/tool_use/test_tool_calls.py
tests/tool_use/test_tool_choice_required.py
tests/tool_use/utils.py
tests/tool_use/mistral/__init__.py
tests/tool_use/mistral/conftest.py
tests/tool_use/mistral/test_mistral_tool_calls.py
tests/tool_use/mistral/utils.py
tests/tools/__init__.py
tests/tools/test_config_validator.py
tests/tracing/__init__.py
tests/tracing/conftest.py
tests/tracing/test_loading_tracing.py
tests/transformers_utils/__init__.py
tests/transformers_utils/test_config.py
tests/transformers_utils/test_config_parser_registry.py
tests/transformers_utils/test_processor.py
tests/transformers_utils/test_repo_utils.py
tests/transformers_utils/test_utils.py
tests/utils_/__init__.py
tests/utils_/test_argparse_utils.py
tests/utils_/test_async_utils.py
tests/utils_/test_cache.py
tests/utils_/test_collection_utils.py
tests/utils_/test_func_utils.py
tests/utils_/test_gc_utils.py
tests/utils_/test_hashing.py
tests/utils_/test_import_utils.py
tests/utils_/test_jsontree.py
tests/utils_/test_mem_utils.py
tests/utils_/test_network_utils.py
tests/utils_/test_serial_utils.py
tests/utils_/test_system_utils.py
tests/utils_/test_tensor_schema.py
tests/utils_/test_torch_utils.py
tests/v1/__init__.py
tests/v1/test_oracle.py
tests/v1/test_outputs.py
tests/v1/test_request.py
tests/v1/test_serial_utils.py
tests/v1/test_tensor_ipc_queue.py
tests/v1/utils.py
tests/v1/attention/test_attention_backends.py
tests/v1/attention/test_attention_backends_selection.py
tests/v1/attention/test_attention_splitting.py
tests/v1/attention/test_batch_reordering.py
tests/v1/attention/test_chunked_local_attention.py
tests/v1/attention/test_gdn_metadata_builder.py
tests/v1/attention/test_mamba_update_block_table.py
tests/v1/attention/test_mla_backends.py
tests/v1/attention/test_rocm_attention_backends_selection.py
tests/v1/attention/test_sparse_mla_backends.py
tests/v1/attention/test_trtllm_attention_integration.py
tests/v1/attention/utils.py
tests/v1/core/__init__.py
tests/v1/core/test_async_scheduler.py
tests/v1/core/test_encoder_cache_manager.py
tests/v1/core/test_kv_cache_metrics.py
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_sharing.py
tests/v1/core/test_output.py
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_priority_scheduler_random.py
tests/v1/core/test_repetition_detection.py
tests/v1/core/test_reset_prefix_cache_e2e.py
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler_e2e.py
tests/v1/core/test_single_type_kv_cache_manager.py
tests/v1/core/utils.py
tests/v1/cudagraph/__init__.py
tests/v1/cudagraph/test_cudagraph_dispatch.py
tests/v1/cudagraph/test_cudagraph_mode.py
tests/v1/cudagraph/test_encoder_cudagraph.py
tests/v1/determinism/conftest.py
tests/v1/determinism/test_batch_invariance.py
tests/v1/determinism/test_online_batch_invariance.py
tests/v1/determinism/test_rms_norm_batch_invariant.py
tests/v1/determinism/utils.py
tests/v1/distributed/__init__.py
tests/v1/distributed/test_async_llm_dp.py
tests/v1/distributed/test_dbo.py
tests/v1/distributed/test_eagle_dp.py
tests/v1/distributed/test_external_lb_dp.py
tests/v1/distributed/test_hybrid_lb_dp.py
tests/v1/distributed/test_internal_lb_dp.py
tests/v1/e2e/__init__.py
tests/v1/e2e/test_hybrid_chunked_prefill.py
tests/v1/e2e/general/__init__.py
tests/v1/e2e/general/test_async_scheduling.py
tests/v1/e2e/general/test_cascade_attention.py
tests/v1/e2e/general/test_context_length.py
tests/v1/e2e/general/test_correctness_sliding_window.py
tests/v1/e2e/general/test_kv_sharing_fast_prefill.py
tests/v1/e2e/general/test_mamba_prefix_cache.py
tests/v1/e2e/general/test_min_tokens.py
tests/v1/e2e/general/test_pooling_chunked_prefill.py
tests/v1/e2e/general/test_streaming_input.py
tests/v1/e2e/spec_decode/__init__.py
tests/v1/e2e/spec_decode/test_async_spec_decode.py
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
tests/v1/e2e/spec_decode/test_spec_decode.py
tests/v1/ec_connector/integration/README.md
tests/v1/ec_connector/integration/hato.jpg
tests/v1/ec_connector/integration/run_epd_correctness_test.sh
tests/v1/ec_connector/integration/test_epd_correctness.py
tests/v1/ec_connector/unit/test_ec_example_connector.py
tests/v1/engine/__init__.py
tests/v1/engine/conftest.py
tests/v1/engine/test_abort_final_step.py
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_engine_args.py
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_fast_incdec_prefix_err.py
tests/v1/engine/test_init_error_messaging.py
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_output_processor.py
tests/v1/engine/test_parallel_sampling.py
tests/v1/engine/test_preprocess_error_handling.py
tests/v1/engine/utils.py
tests/v1/entrypoints/openai/test_thinking_token_budget.py
tests/v1/executor/__init__.py
tests/v1/executor/test_executor.py
tests/v1/kv_connector/__init__.py
tests/v1/kv_connector/extract_hidden_states_integration/__init__.py
tests/v1/kv_connector/extract_hidden_states_integration/predictable_llama.py
tests/v1/kv_connector/extract_hidden_states_integration/test_extraction.py
tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh
tests/v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
tests/v1/kv_connector/nixl_integration/test_accuracy.py
tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py
tests/v1/kv_connector/nixl_integration/test_edge_cases.py
tests/v1/kv_connector/nixl_integration/test_spec_decode_acceptance.py
tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
tests/v1/kv_connector/unit/__init__.py
tests/v1/kv_connector/unit/test_backwards_compatibility.py
tests/v1/kv_connector/unit/test_cache_pollution_prevention.py
tests/v1/kv_connector/unit/test_config.py
tests/v1/kv_connector/unit/test_decode_bench_connector.py
tests/v1/kv_connector/unit/test_error_propagation.py
tests/v1/kv_connector/unit/test_example_connector.py
tests/v1/kv_connector/unit/test_flexkv_connector.py
tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py
tests/v1/kv_connector/unit/test_kv_cache_layout.py
tests/v1/kv_connector/unit/test_kv_connector_lifecycle.py
tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
tests/v1/kv_connector/unit/test_lmcache_connector.py
tests/v1/kv_connector/unit/test_lmcache_integration.py
tests/v1/kv_connector/unit/test_mooncake_connector.py
tests/v1/kv_connector/unit/test_moriio_connector.py
tests/v1/kv_connector/unit/test_multi_connector.py
tests/v1/kv_connector/unit/test_nixl_connector.py
tests/v1/kv_connector/unit/test_nixl_connector_hma.py
tests/v1/kv_connector/unit/test_output_aggregator.py
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
tests/v1/kv_connector/unit/test_scheduler_kv_connector_override.py
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/offloading_connector/__init__.py
tests/v1/kv_connector/unit/offloading_connector/conftest.py
tests/v1/kv_connector/unit/offloading_connector/test_metrics.py
tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
tests/v1/kv_connector/unit/offloading_connector/test_worker.py
tests/v1/kv_connector/unit/offloading_connector/utils.py
tests/v1/kv_offload/test_cpu_gpu.py
tests/v1/kv_offload/test_cpu_manager.py
tests/v1/kv_offload/test_cpu_offloading.py
tests/v1/kv_offload/test_worker.py
tests/v1/logits_processors/__init__.py
tests/v1/logits_processors/test_correctness.py
tests/v1/logits_processors/test_custom_offline.py
tests/v1/logits_processors/test_custom_online.py
tests/v1/logits_processors/utils.py
tests/v1/metrics/test_engine_logger_apis.py
tests/v1/metrics/test_metrics_reader.py
tests/v1/metrics/test_perf_metrics.py
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_stats.py
tests/v1/sample/__init__.py
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs_e2e.py
tests/v1/sample/test_rejection_sampler.py
tests/v1/sample/test_sampler.py
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_topk_topp_sampler.py
tests/v1/sample/utils.py
tests/v1/shutdown/conftest.py
tests/v1/shutdown/test_delete.py
tests/v1/shutdown/test_forward_error.py
tests/v1/shutdown/test_processor_error.py
tests/v1/shutdown/test_startup_error.py
tests/v1/shutdown/utils.py
tests/v1/simple_kv_offload/__init__.py
tests/v1/simple_kv_offload/test_integration.py
tests/v1/simple_kv_offload/test_scheduler.py
tests/v1/spec_decode/__init__.py
tests/v1/spec_decode/test_acceptance_length.py
tests/v1/spec_decode/test_backup_token_async_spec.py
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle_step_kernel.py
tests/v1/spec_decode/test_extract_hidden_states.py
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_mtp.py
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_speculators_eagle3.py
tests/v1/spec_decode/test_synthetic_rejection_sampler_utils.py
tests/v1/spec_decode/test_tree_attention.py
tests/v1/streaming_input/__init__.py
tests/v1/streaming_input/test_async_llm_streaming.py
tests/v1/streaming_input/test_gpu_model_runner_streaming.py
tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py
tests/v1/streaming_input/test_scheduler_streaming.py
tests/v1/structured_output/__init__.py
tests/v1/structured_output/test_backend_guidance.py
tests/v1/structured_output/test_reasoning_structured_output.py
tests/v1/structured_output/test_utils.py
tests/v1/tracing/__init__.py
tests/v1/tracing/test_tracing.py
tests/v1/worker/__init__.py
tests/v1/worker/test_gpu_input_batch.py
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner_v2_eplb.py
tests/v1/worker/test_gpu_profiler.py
tests/v1/worker/test_late_interaction_runner.py
tests/v1/worker/test_mamba_utils.py
tests/v1/worker/test_utils.py
tests/v1/worker/test_worker_memory_snapshot.py
tests/vllm_test_utils/setup.py
tests/vllm_test_utils/vllm_test_utils/__init__.py
tests/vllm_test_utils/vllm_test_utils/blame.py
tests/vllm_test_utils/vllm_test_utils/monitor.py
tests/weight_loading/models-amd.txt
tests/weight_loading/models-large-amd.txt
tests/weight_loading/models-large.txt
tests/weight_loading/models.txt
tests/weight_loading/run_model_weight_loading_test.sh
tests/weight_loading/test_weight_loading.py
tools/check_repo.sh
tools/flashinfer-build.sh
tools/generate_cmake_presets.py
tools/generate_versions_json.py
tools/install_deepgemm.sh
tools/install_gdrcopy.sh
tools/install_nixl_from_source_ubuntu.py
tools/install_torchcodec_rocm.sh
tools/report_build_time_ninja.py
tools/ep_kernels/README.md
tools/ep_kernels/configure_system_drivers.sh
tools/ep_kernels/install_python_libraries.sh
tools/ep_kernels/elastic_ep/eep_nvshmem.patch
tools/ep_kernels/elastic_ep/install_eep_libraries.sh
tools/pre_commit/check_boolean_context_manager.py
tools/pre_commit/check_forbidden_imports.py
tools/pre_commit/check_init_lazy_imports.py
tools/pre_commit/check_spdx_header.py
tools/pre_commit/check_torch_cuda.py
tools/pre_commit/generate_attention_backend_docs.py
tools/pre_commit/generate_nightly_torch_test.py
tools/pre_commit/mypy.py
tools/pre_commit/png-lint.sh
tools/pre_commit/shellcheck.sh
tools/pre_commit/update-dockerfile-graph.sh
tools/pre_commit/validate_config.py
tools/profiler/print_layerwise_table.py
tools/profiler/visualize_layerwise_profile.py
tools/profiler/nsys_profile_tools/README.md
tools/profiler/nsys_profile_tools/gputrc2graph.py
tools/profiler/nsys_profile_tools/vllm_engine_model.json
tools/profiler/nsys_profile_tools/images/csv1.png
tools/profiler/nsys_profile_tools/images/html.png
tools/profiler/nsys_profile_tools/images/html_tbl.png
tools/vllm-rocm/generate-rocm-wheels-root-index.sh
tools/vllm-rocm/pin_rocm_dependencies.py
tools/vllm-tpu/build.sh
vllm/__init__.py
vllm/_aiter_ops.py
vllm/_custom_ops.py
vllm/_oink_ops.py
vllm/_version.py
vllm/_xpu_ops.py
vllm/beam_search.py
vllm/collect_env.py
vllm/connections.py
vllm/env_override.py
vllm/envs.py
vllm/exceptions.py
vllm/forward_context.py
vllm/logger.py
vllm/logits_process.py
vllm/logprobs.py
vllm/model_inspection.py
vllm/outputs.py
vllm/pooling_params.py
vllm/py.typed
vllm/sampling_params.py
vllm/scalar_type.py
vllm/scripts.py
vllm/sequence.py
vllm/tasks.py
vllm/version.py
vllm.egg-info/PKG-INFO
vllm.egg-info/SOURCES.txt
vllm.egg-info/dependency_links.txt
vllm.egg-info/entry_points.txt
vllm.egg-info/requires.txt
vllm.egg-info/top_level.txt
vllm/assets/__init__.py
vllm/assets/audio.py
vllm/assets/base.py
vllm/assets/image.py
vllm/assets/video.py
vllm/benchmarks/__init__.py
vllm/benchmarks/datasets.py
vllm/benchmarks/latency.py
vllm/benchmarks/mm_processor.py
vllm/benchmarks/plot.py
vllm/benchmarks/serve.py
vllm/benchmarks/startup.py
vllm/benchmarks/throughput.py
vllm/benchmarks/lib/__init__.py
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/ready_checker.py
vllm/benchmarks/lib/utils.py
vllm/benchmarks/sweep/__init__.py
vllm/benchmarks/sweep/cli.py
vllm/benchmarks/sweep/param_sweep.py
vllm/benchmarks/sweep/plot.py
vllm/benchmarks/sweep/plot_pareto.py
vllm/benchmarks/sweep/serve.py
vllm/benchmarks/sweep/serve_workload.py
vllm/benchmarks/sweep/server.py
vllm/benchmarks/sweep/startup.py
vllm/benchmarks/sweep/utils.py
vllm/compilation/__init__.py
vllm/compilation/backends.py
vllm/compilation/base_static_graph.py
vllm/compilation/caching.py
vllm/compilation/compiler_interface.py
vllm/compilation/counter.py
vllm/compilation/cuda_graph.py
vllm/compilation/decorators.py
vllm/compilation/monitor.py
vllm/compilation/partition_rules.py
vllm/compilation/piecewise_backend.py
vllm/compilation/wrapper.py
vllm/compilation/passes/__init__.py
vllm/compilation/passes/fx_utils.py
vllm/compilation/passes/inductor_pass.py
vllm/compilation/passes/pass_manager.py
vllm/compilation/passes/vllm_inductor_pass.py
vllm/compilation/passes/fusion/__init__.py
vllm/compilation/passes/fusion/act_quant_fusion.py
vllm/compilation/passes/fusion/allreduce_rms_fusion.py
vllm/compilation/passes/fusion/attn_quant_fusion.py
vllm/compilation/passes/fusion/collective_fusion.py
vllm/compilation/passes/fusion/matcher_utils.py
vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
vllm/compilation/passes/fusion/qk_norm_rope_fusion.py
vllm/compilation/passes/fusion/rms_quant_fusion.py
vllm/compilation/passes/fusion/rocm_aiter_fusion.py
vllm/compilation/passes/fusion/rope_kvcache_fusion.py
vllm/compilation/passes/fusion/sequence_parallelism.py
vllm/compilation/passes/utility/__init__.py
vllm/compilation/passes/utility/fix_functionalization.py
vllm/compilation/passes/utility/noop_elimination.py
vllm/compilation/passes/utility/post_cleanup.py
vllm/compilation/passes/utility/scatter_split_replace.py
vllm/compilation/passes/utility/split_coalescing.py
vllm/config/__init__.py
vllm/config/attention.py
vllm/config/cache.py
vllm/config/compilation.py
vllm/config/device.py
vllm/config/ec_transfer.py
vllm/config/kernel.py
vllm/config/kv_events.py
vllm/config/kv_transfer.py
vllm/config/load.py
vllm/config/lora.py
vllm/config/model.py
vllm/config/model_arch.py
vllm/config/multimodal.py
vllm/config/observability.py
vllm/config/offload.py
vllm/config/parallel.py
vllm/config/pooler.py
vllm/config/profiler.py
vllm/config/reasoning.py
vllm/config/scheduler.py
vllm/config/speculative.py
vllm/config/speech_to_text.py
vllm/config/structured_outputs.py
vllm/config/utils.py
vllm/config/vllm.py
vllm/config/weight_transfer.py
vllm/device_allocator/__init__.py
vllm/device_allocator/cumem.py
vllm/distributed/__init__.py
vllm/distributed/communication_op.py
vllm/distributed/kv_events.py
vllm/distributed/parallel_state.py
vllm/distributed/stateless_coordinator.py
vllm/distributed/utils.py
vllm/distributed/device_communicators/__init__.py
vllm/distributed/device_communicators/all2all.py
vllm/distributed/device_communicators/all_reduce_utils.py
vllm/distributed/device_communicators/base_device_communicator.py
vllm/distributed/device_communicators/cpu_communicator.py
vllm/distributed/device_communicators/cuda_communicator.py
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/flashinfer_all_reduce.py
vllm/distributed/device_communicators/mnnvl_compat.py
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl_allocator.py
vllm/distributed/device_communicators/pynccl_wrapper.py
vllm/distributed/device_communicators/quick_all_reduce.py
vllm/distributed/device_communicators/ray_communicator.py
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/shm_object_storage.py
vllm/distributed/device_communicators/symm_mem.py
vllm/distributed/device_communicators/xpu_communicator.py
vllm/distributed/ec_transfer/__init__.py
vllm/distributed/ec_transfer/ec_transfer_state.py
vllm/distributed/ec_transfer/ec_connector/__init__.py
vllm/distributed/ec_transfer/ec_connector/base.py
vllm/distributed/ec_transfer/ec_connector/example_connector.py
vllm/distributed/ec_transfer/ec_connector/factory.py
vllm/distributed/elastic_ep/__init__.py
vllm/distributed/elastic_ep/elastic_execute.py
vllm/distributed/elastic_ep/elastic_state.py
vllm/distributed/elastic_ep/standby_state.py
vllm/distributed/eplb/__init__.py
vllm/distributed/eplb/async_worker.py
vllm/distributed/eplb/eplb_state.py
vllm/distributed/eplb/eplb_utils.py
vllm/distributed/eplb/rebalance_execute.py
vllm/distributed/eplb/policy/__init__.py
vllm/distributed/eplb/policy/abstract.py
vllm/distributed/eplb/policy/default.py
vllm/distributed/kv_transfer/README.md
vllm/distributed/kv_transfer/__init__.py
vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
vllm/distributed/kv_transfer/kv_transfer_state.py
vllm/distributed/kv_transfer/kv_connector/__init__.py
vllm/distributed/kv_transfer/kv_connector/base.py
vllm/distributed/kv_transfer/kv_connector/factory.py
vllm/distributed/kv_transfer/kv_connector/utils.py
vllm/distributed/kv_transfer/kv_connector/v1/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/base.py
vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/example_hidden_states_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/flexkv_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/simple_cpu_offload_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
vllm/distributed/kv_transfer/kv_connector/v1/mooncake/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_utils.py
vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading/common.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading/metrics.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
vllm/distributed/kv_transfer/kv_connector/v1/offloading/worker.py
vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
vllm/distributed/weight_transfer/__init__.py
vllm/distributed/weight_transfer/base.py
vllm/distributed/weight_transfer/factory.py
vllm/distributed/weight_transfer/ipc_engine.py
vllm/distributed/weight_transfer/nccl_engine.py
vllm/distributed/weight_transfer/packed_tensor.py
vllm/engine/__init__.py
vllm/engine/arg_utils.py
vllm/engine/async_llm_engine.py
vllm/engine/llm_engine.py
vllm/engine/protocol.py
vllm/entrypoints/__init__.py
vllm/entrypoints/api_server.py
vllm/entrypoints/chat_utils.py
vllm/entrypoints/constants.py
vllm/entrypoints/grpc_server.py
vllm/entrypoints/launcher.py
vllm/entrypoints/llm.py
vllm/entrypoints/logger.py
vllm/entrypoints/ssl.py
vllm/entrypoints/utils.py
vllm/entrypoints/anthropic/__init__.py
vllm/entrypoints/anthropic/api_router.py
vllm/entrypoints/anthropic/protocol.py
vllm/entrypoints/anthropic/serving.py
vllm/entrypoints/cli/__init__.py
vllm/entrypoints/cli/collect_env.py
vllm/entrypoints/cli/launch.py
vllm/entrypoints/cli/main.py
vllm/entrypoints/cli/openai.py
vllm/entrypoints/cli/run_batch.py
vllm/entrypoints/cli/serve.py
vllm/entrypoints/cli/types.py
vllm/entrypoints/cli/benchmark/__init__.py
vllm/entrypoints/cli/benchmark/base.py
vllm/entrypoints/cli/benchmark/latency.py
vllm/entrypoints/cli/benchmark/main.py
vllm/entrypoints/cli/benchmark/mm_processor.py
vllm/entrypoints/cli/benchmark/serve.py
vllm/entrypoints/cli/benchmark/startup.py
vllm/entrypoints/cli/benchmark/sweep.py
vllm/entrypoints/cli/benchmark/throughput.py
vllm/entrypoints/mcp/__init__.py
vllm/entrypoints/mcp/tool.py
vllm/entrypoints/mcp/tool_server.py
vllm/entrypoints/openai/__init__.py
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/orca_metrics.py
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/server_utils.py
vllm/entrypoints/openai/utils.py
vllm/entrypoints/openai/chat_completion/__init__.py
vllm/entrypoints/openai/chat_completion/api_router.py
vllm/entrypoints/openai/chat_completion/batch_serving.py
vllm/entrypoints/openai/chat_completion/protocol.py
vllm/entrypoints/openai/chat_completion/serving.py
vllm/entrypoints/openai/chat_completion/stream_harmony.py
vllm/entrypoints/openai/completion/__init__.py
vllm/entrypoints/openai/completion/api_router.py
vllm/entrypoints/openai/completion/protocol.py
vllm/entrypoints/openai/completion/serving.py
vllm/entrypoints/openai/engine/__init__.py
vllm/entrypoints/openai/engine/protocol.py
vllm/entrypoints/openai/engine/serving.py
vllm/entrypoints/openai/generate/__init__.py
vllm/entrypoints/openai/generate/api_router.py
vllm/entrypoints/openai/models/__init__.py
vllm/entrypoints/openai/models/api_router.py
vllm/entrypoints/openai/models/protocol.py
vllm/entrypoints/openai/models/serving.py
vllm/entrypoints/openai/parser/__init__.py
vllm/entrypoints/openai/parser/harmony_utils.py
vllm/entrypoints/openai/parser/responses_parser.py
vllm/entrypoints/openai/realtime/__init__.py
vllm/entrypoints/openai/realtime/api_router.py
vllm/entrypoints/openai/realtime/connection.py
vllm/entrypoints/openai/realtime/metrics.py
vllm/entrypoints/openai/realtime/protocol.py
vllm/entrypoints/openai/realtime/serving.py
vllm/entrypoints/openai/responses/__init__.py
vllm/entrypoints/openai/responses/api_router.py
vllm/entrypoints/openai/responses/context.py
vllm/entrypoints/openai/responses/harmony.py
vllm/entrypoints/openai/responses/protocol.py
vllm/entrypoints/openai/responses/serving.py
vllm/entrypoints/openai/responses/streaming_events.py
vllm/entrypoints/openai/responses/utils.py
vllm/entrypoints/openai/speech_to_text/__init__.py
vllm/entrypoints/openai/speech_to_text/api_router.py
vllm/entrypoints/openai/speech_to_text/protocol.py
vllm/entrypoints/openai/speech_to_text/serving.py
vllm/entrypoints/openai/speech_to_text/speech_to_text.py
vllm/entrypoints/pooling/__init__.py
vllm/entrypoints/pooling/io_processor_factories.py
vllm/entrypoints/pooling/typing.py
vllm/entrypoints/pooling/utils.py
vllm/entrypoints/pooling/base/__init__.py
vllm/entrypoints/pooling/base/io_processor.py
vllm/entrypoints/pooling/base/protocol.py
vllm/entrypoints/pooling/base/serving.py
vllm/entrypoints/pooling/classify/__init__.py
vllm/entrypoints/pooling/classify/api_router.py
vllm/entrypoints/pooling/classify/io_processor.py
vllm/entrypoints/pooling/classify/protocol.py
vllm/entrypoints/pooling/classify/serving.py
vllm/entrypoints/pooling/embed/__init__.py
vllm/entrypoints/pooling/embed/api_router.py
vllm/entrypoints/pooling/embed/io_processor.py
vllm/entrypoints/pooling/embed/protocol.py
vllm/entrypoints/pooling/embed/serving.py
vllm/entrypoints/pooling/pooling/__init__.py
vllm/entrypoints/pooling/pooling/api_router.py
vllm/entrypoints/pooling/pooling/protocol.py
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/score/__init__.py
vllm/entrypoints/pooling/score/api_router.py
vllm/entrypoints/pooling/score/protocol.py
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/utils.py
vllm/entrypoints/sagemaker/__init__.py
vllm/entrypoints/sagemaker/api_router.py
vllm/entrypoints/serve/__init__.py
vllm/entrypoints/serve/cache/__init__.py
vllm/entrypoints/serve/cache/api_router.py
vllm/entrypoints/serve/disagg/__init__.py
vllm/entrypoints/serve/disagg/api_router.py
vllm/entrypoints/serve/disagg/protocol.py
vllm/entrypoints/serve/disagg/serving.py
vllm/entrypoints/serve/elastic_ep/__init__.py
vllm/entrypoints/serve/elastic_ep/api_router.py
vllm/entrypoints/serve/elastic_ep/middleware.py
vllm/entrypoints/serve/instrumentator/__init__.py
vllm/entrypoints/serve/instrumentator/basic.py
vllm/entrypoints/serve/instrumentator/health.py
vllm/entrypoints/serve/instrumentator/metrics.py
vllm/entrypoints/serve/instrumentator/offline_docs.py
vllm/entrypoints/serve/instrumentator/server_info.py
vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js
vllm/entrypoints/serve/instrumentator/static/swagger-ui.css
vllm/entrypoints/serve/lora/__init__.py
vllm/entrypoints/serve/lora/api_router.py
vllm/entrypoints/serve/lora/protocol.py
vllm/entrypoints/serve/profile/__init__.py
vllm/entrypoints/serve/profile/api_router.py
vllm/entrypoints/serve/render/__init__.py
vllm/entrypoints/serve/render/api_router.py
vllm/entrypoints/serve/render/serving.py
vllm/entrypoints/serve/rlhf/__init__.py
vllm/entrypoints/serve/rlhf/api_router.py
vllm/entrypoints/serve/rpc/__init__.py
vllm/entrypoints/serve/rpc/api_router.py
vllm/entrypoints/serve/sleep/__init__.py
vllm/entrypoints/serve/sleep/api_router.py
vllm/entrypoints/serve/tokenize/__init__.py
vllm/entrypoints/serve/tokenize/api_router.py
vllm/entrypoints/serve/tokenize/protocol.py
vllm/entrypoints/serve/tokenize/serving.py
vllm/inputs/__init__.py
vllm/inputs/engine.py
vllm/inputs/llm.py
vllm/inputs/preprocess.py
vllm/kernels/__init__.py
vllm/kernels/helion/__init__.py
vllm/kernels/helion/config_manager.py
vllm/kernels/helion/register.py
vllm/kernels/helion/utils.py
vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h100.json
vllm/kernels/helion/configs/silu_mul_fp8/nvidia_h200.json
vllm/kernels/helion/ops/__init__.py
vllm/kernels/helion/ops/silu_mul_fp8.py
vllm/logging_utils/__init__.py
vllm/logging_utils/access_log_filter.py
vllm/logging_utils/dump_input.py
vllm/logging_utils/formatter.py
vllm/logging_utils/lazy.py
vllm/logging_utils/log_time.py
vllm/lora/__init__.py
vllm/lora/lora_model.py
vllm/lora/lora_weights.py
vllm/lora/model_manager.py
vllm/lora/peft_helper.py
vllm/lora/request.py
vllm/lora/resolver.py
vllm/lora/utils.py
vllm/lora/worker_manager.py
vllm/lora/layers/__init__.py
vllm/lora/layers/base.py
vllm/lora/layers/base_linear.py
vllm/lora/layers/column_parallel_linear.py
vllm/lora/layers/fused_moe.py
vllm/lora/layers/gate_linear.py
vllm/lora/layers/logits_processor.py
vllm/lora/layers/replicated_linear.py
vllm/lora/layers/row_parallel_linear.py
vllm/lora/layers/utils.py
vllm/lora/layers/vocal_parallel_embedding.py
vllm/lora/ops/__init__.py
vllm/lora/ops/torch_ops/__init__.py
vllm/lora/ops/torch_ops/lora_ops.py
vllm/lora/ops/triton_ops/README_TUNING.md
vllm/lora/ops/triton_ops/__init__.py
vllm/lora/ops/triton_ops/fp8_kernel_utils.py
vllm/lora/ops/triton_ops/fused_moe_lora_fp8_op.py
vllm/lora/ops/triton_ops/fused_moe_lora_op.py
vllm/lora/ops/triton_ops/kernel_utils.py
vllm/lora/ops/triton_ops/lora_expand_fp8_op.py
vllm/lora/ops/triton_ops/lora_expand_op.py
vllm/lora/ops/triton_ops/lora_kernel_metadata.py
vllm/lora/ops/triton_ops/lora_shrink_fp8_op.py
vllm/lora/ops/triton_ops/lora_shrink_op.py
vllm/lora/ops/triton_ops/utils.py
vllm/lora/ops/xpu_ops/__init__.py
vllm/lora/ops/xpu_ops/lora_ops.py
vllm/lora/punica_wrapper/__init__.py
vllm/lora/punica_wrapper/punica_base.py
vllm/lora/punica_wrapper/punica_cpu.py
vllm/lora/punica_wrapper/punica_gpu.py
vllm/lora/punica_wrapper/punica_selector.py
vllm/lora/punica_wrapper/punica_xpu.py
vllm/lora/punica_wrapper/utils.py
vllm/model_executor/__init__.py
vllm/model_executor/custom_op.py
vllm/model_executor/parameter.py
vllm/model_executor/utils.py
vllm/model_executor/kernels/__init__.py
vllm/model_executor/kernels/linear/__init__.py
vllm/model_executor/kernels/linear/mixed_precision/MPLinearKernel.py
vllm/model_executor/kernels/linear/mixed_precision/__init__.py
vllm/model_executor/kernels/linear/mixed_precision/allspark.py
vllm/model_executor/kernels/linear/mixed_precision/conch.py
vllm/model_executor/kernels/linear/mixed_precision/cpu.py
vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
vllm/model_executor/kernels/linear/mixed_precision/dynamic_4bit.py
vllm/model_executor/kernels/linear/mixed_precision/exllama.py
vllm/model_executor/kernels/linear/mixed_precision/machete.py
vllm/model_executor/kernels/linear/mixed_precision/marlin.py
vllm/model_executor/kernels/linear/mixed_precision/xpu.py
vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py
vllm/model_executor/kernels/linear/scaled_mm/__init__.py
vllm/model_executor/kernels/linear/scaled_mm/aiter.py
vllm/model_executor/kernels/linear/scaled_mm/cpu.py
vllm/model_executor/kernels/linear/scaled_mm/cutlass.py
vllm/model_executor/kernels/linear/scaled_mm/flashinfer.py
vllm/model_executor/kernels/linear/scaled_mm/marlin.py
vllm/model_executor/kernels/linear/scaled_mm/pytorch.py
vllm/model_executor/kernels/linear/scaled_mm/rocm.py
vllm/model_executor/kernels/linear/scaled_mm/triton.py
vllm/model_executor/kernels/linear/scaled_mm/xpu.py
vllm/model_executor/layers/__init__.py
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/attention_layer_base.py
vllm/model_executor/layers/batch_invariant.py
vllm/model_executor/layers/conv.py
vllm/model_executor/layers/kda.py
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/lightning_attn.py
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/logits_processor.py
vllm/model_executor/layers/mla.py
vllm/model_executor/layers/resampler.py
vllm/model_executor/layers/sparse_attn_indexer.py
vllm/model_executor/layers/utils.py
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/attention/__init__.py
vllm/model_executor/layers/attention/attention.py
vllm/model_executor/layers/attention/chunked_local_attention.py
vllm/model_executor/layers/attention/cross_attention.py
vllm/model_executor/layers/attention/encoder_only_attention.py
vllm/model_executor/layers/attention/kv_transfer_utils.py
vllm/model_executor/layers/attention/mla_attention.py
vllm/model_executor/layers/attention/mm_encoder_attention.py
vllm/model_executor/layers/attention/static_sink_attention.py
vllm/model_executor/layers/fla/__init__.py
vllm/model_executor/layers/fla/ops/__init__.py
vllm/model_executor/layers/fla/ops/chunk.py
vllm/model_executor/layers/fla/ops/chunk_delta_h.py
vllm/model_executor/layers/fla/ops/chunk_o.py
vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
vllm/model_executor/layers/fla/ops/cumsum.py
vllm/model_executor/layers/fla/ops/fused_recurrent.py
vllm/model_executor/layers/fla/ops/fused_sigmoid_gating.py
vllm/model_executor/layers/fla/ops/index.py
vllm/model_executor/layers/fla/ops/kda.py
vllm/model_executor/layers/fla/ops/l2norm.py
vllm/model_executor/layers/fla/ops/layernorm_guard.py
vllm/model_executor/layers/fla/ops/op.py
vllm/model_executor/layers/fla/ops/solve_tril.py
vllm/model_executor/layers/fla/ops/utils.py
vllm/model_executor/layers/fla/ops/wy_fast.py
vllm/model_executor/layers/fused_moe/__init__.py
vllm/model_executor/layers/fused_moe/activation.py
vllm/model_executor/layers/fused_moe/all2all_utils.py
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/config.py
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
vllm/model_executor/layers/fused_moe/cutlass_moe.py
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
vllm/model_executor/layers/fused_moe/fallback.py
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/moe_align_block_size.py
vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py
vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/routed_experts_capturer.py
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
vllm/model_executor/layers/fused_moe/utils.py
vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H800,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=Radeon_8060S_Graphics,dtype=int4_w4a16.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=16,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=32,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350X,dtype=int4_w4a16.json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI350_OAM,dtype=int4_w4a16.json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355X,dtype=int4_w4a16.json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=AMD_Instinct_MI355_OAM,dtype=int4_w4a16.json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=1344,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=512,N=672,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json
vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
vllm/model_executor/layers/fused_moe/configs/README
vllm/model_executor/layers/fused_moe/experts/__init__.py
vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
vllm/model_executor/layers/fused_moe/oracle/__init__.py
vllm/model_executor/layers/fused_moe/oracle/fp8.py
vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
vllm/model_executor/layers/fused_moe/oracle/mxfp8.py
vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
vllm/model_executor/layers/fused_moe/oracle/unquantized.py
vllm/model_executor/layers/fused_moe/prepare_finalize/__init__.py
vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ht.py
vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py
vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py
vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_two_sided.py
vllm/model_executor/layers/fused_moe/prepare_finalize/naive_dp_ep.py
vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py
vllm/model_executor/layers/fused_moe/router/__init__.py
vllm/model_executor/layers/fused_moe/router/base_router.py
vllm/model_executor/layers/fused_moe/router/custom_routing_router.py
vllm/model_executor/layers/fused_moe/router/fused_moe_router.py
vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py
vllm/model_executor/layers/fused_moe/router/fused_topk_router.py
vllm/model_executor/layers/fused_moe/router/gate_linear.py
vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py
vllm/model_executor/layers/fused_moe/router/router_factory.py
vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py
vllm/model_executor/layers/fused_moe/runner/__init__.py
vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py
vllm/model_executor/layers/fused_moe/runner/moe_runner.py
vllm/model_executor/layers/mamba/__init__.py
vllm/model_executor/layers/mamba/abstract.py
vllm/model_executor/layers/mamba/gdn_linear_attn.py
vllm/model_executor/layers/mamba/lamport_workspace.py
vllm/model_executor/layers/mamba/linear_attn.py
vllm/model_executor/layers/mamba/mamba_mixer.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/mamba_utils.py
vllm/model_executor/layers/mamba/short_conv.py
vllm/model_executor/layers/mamba/ops/__init__.py
vllm/model_executor/layers/mamba/ops/causal_conv1d.py
vllm/model_executor/layers/mamba/ops/layernorm_gated.py
vllm/model_executor/layers/mamba/ops/mamba_ssm.py
vllm/model_executor/layers/mamba/ops/ssd_bmm.py
vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
vllm/model_executor/layers/mamba/ops/ssd_combined.py
vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
vllm/model_executor/layers/mamba/ops/triton_helpers.py
vllm/model_executor/layers/pooler/__init__.py
vllm/model_executor/layers/pooler/abstract.py
vllm/model_executor/layers/pooler/activations.py
vllm/model_executor/layers/pooler/common.py
vllm/model_executor/layers/pooler/special.py
vllm/model_executor/layers/pooler/seqwise/__init__.py
vllm/model_executor/layers/pooler/seqwise/heads.py
vllm/model_executor/layers/pooler/seqwise/methods.py
vllm/model_executor/layers/pooler/seqwise/poolers.py
vllm/model_executor/layers/pooler/tokwise/__init__.py
vllm/model_executor/layers/pooler/tokwise/heads.py
vllm/model_executor/layers/pooler/tokwise/methods.py
vllm/model_executor/layers/pooler/tokwise/poolers.py
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_triton.py
vllm/model_executor/layers/quantization/base_config.py
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/cpu_wna16.py
vllm/model_executor/layers/quantization/experts_int8.py
vllm/model_executor/layers/quantization/fbgemm_fp8.py
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp_quant.py
vllm/model_executor/layers/quantization/gguf.py
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/layers/quantization/inc.py
vllm/model_executor/layers/quantization/input_quant_fp8.py
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/mxfp4.py
vllm/model_executor/layers/quantization/mxfp8.py
vllm/model_executor/layers/quantization/petit.py
vllm/model_executor/layers/quantization/qutlass_utils.py
vllm/model_executor/layers/quantization/schema.py
vllm/model_executor/layers/quantization/torchao.py
vllm/model_executor/layers/quantization/compressed_tensors/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py
vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
vllm/model_executor/layers/quantization/quark/__init__.py
vllm/model_executor/layers/quantization/quark/quark.py
vllm/model_executor/layers/quantization/quark/quark_moe.py
vllm/model_executor/layers/quantization/quark/utils.py
vllm/model_executor/layers/quantization/quark/schemes/__init__.py
vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
vllm/model_executor/layers/quantization/quark/schemes/quark_w4a8_mxfp4_fp8.py
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
vllm/model_executor/layers/quantization/utils/__init__.py
vllm/model_executor/layers/quantization/utils/allspark_utils.py
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
vllm/model_executor/layers/quantization/utils/flashinfer_mxint4_moe.py
vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/gptq_utils.py
vllm/model_executor/layers/quantization/utils/int8_utils.py
vllm/model_executor/layers/quantization/utils/layer_utils.py
vllm/model_executor/layers/quantization/utils/machete_utils.py
vllm/model_executor/layers/quantization/utils/marlin_utils.py
vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
vllm/model_executor/layers/quantization/utils/mxfp6_utils.py
vllm/model_executor/layers/quantization/utils/mxfp8_utils.py
vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
vllm/model_executor/layers/quantization/utils/petit_utils.py
vllm/model_executor/layers/quantization/utils/quant_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/configs/N=1024,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=12288,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/N=9216,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
vllm/model_executor/layers/quantization/utils/configs/README.md
vllm/model_executor/layers/rotary_embedding/__init__.py
vllm/model_executor/layers/rotary_embedding/base.py
vllm/model_executor/layers/rotary_embedding/common.py
vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py
vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py
vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
vllm/model_executor/layers/rotary_embedding/fope.py
vllm/model_executor/layers/rotary_embedding/gemma4_rope.py
vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
vllm/model_executor/layers/rotary_embedding/llama3_rope.py
vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
vllm/model_executor/layers/rotary_embedding/mrope.py
vllm/model_executor/layers/rotary_embedding/mrope_interleaved.py
vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
vllm/model_executor/layers/rotary_embedding/xdrope.py
vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py
vllm/model_executor/model_loader/__init__.py
vllm/model_executor/model_loader/base_loader.py
vllm/model_executor/model_loader/bitsandbytes_loader.py
vllm/model_executor/model_loader/default_loader.py
vllm/model_executor/model_loader/dummy_loader.py
vllm/model_executor/model_loader/ep_weight_filter.py
vllm/model_executor/model_loader/gguf_loader.py
vllm/model_executor/model_loader/runai_streamer_loader.py
vllm/model_executor/model_loader/sharded_state_loader.py
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer_loader.py
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/reload/__init__.py
vllm/model_executor/model_loader/reload/layerwise.py
vllm/model_executor/model_loader/reload/meta.py
vllm/model_executor/model_loader/reload/sanitize.py
vllm/model_executor/model_loader/reload/torchao_decorator.py
vllm/model_executor/model_loader/reload/types.py
vllm/model_executor/model_loader/reload/utils.py
vllm/model_executor/models/AXK1.py
vllm/model_executor/models/__init__.py
vllm/model_executor/models/adapters.py
vllm/model_executor/models/afmoe.py
vllm/model_executor/models/aimv2.py
vllm/model_executor/models/apertus.py
vllm/model_executor/models/arcee.py
vllm/model_executor/models/arctic.py
vllm/model_executor/models/aria.py
vllm/model_executor/models/audioflamingo3.py
vllm/model_executor/models/aya_vision.py
vllm/model_executor/models/bagel.py
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/bailing_moe.py
vllm/model_executor/models/bailing_moe_linear.py
vllm/model_executor/models/bamba.py
vllm/model_executor/models/bee.py
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert_with_rope.py
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip2.py
vllm/model_executor/models/bloom.py
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/clip.py
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere_asr.py
vllm/model_executor/models/colbert.py
vllm/model_executor/models/colmodernvbert.py
vllm/model_executor/models/colpali.py
vllm/model_executor/models/colqwen3.py
vllm/model_executor/models/colqwen3_5.py
vllm/model_executor/models/commandr.py
vllm/model_executor/models/config.py
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/deepencoder.py
vllm/model_executor/models/deepencoder2.py
vllm/model_executor/models/deepseek_eagle.py
vllm/model_executor/models/deepseek_eagle3.py
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_ocr.py
vllm/model_executor/models/deepseek_ocr2.py
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/dots1.py
vllm/model_executor/models/dots_ocr.py
vllm/model_executor/models/eagle2_5_vl.py
vllm/model_executor/models/ernie.py
vllm/model_executor/models/ernie45.py
vllm/model_executor/models/ernie45_moe.py
vllm/model_executor/models/ernie45_vl.py
vllm/model_executor/models/ernie45_vl_moe.py
vllm/model_executor/models/ernie_mtp.py
vllm/model_executor/models/exaone.py
vllm/model_executor/models/exaone4.py
vllm/model_executor/models/exaone_moe.py
vllm/model_executor/models/exaone_moe_mtp.py
vllm/model_executor/models/extract_hidden_states.py
vllm/model_executor/models/fairseq2_llama.py
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon_h1.py
vllm/model_executor/models/fireredasr2.py
vllm/model_executor/models/flex_olmo.py
vllm/model_executor/models/funasr.py
vllm/model_executor/models/funaudiochat.py
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma3.py
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3n.py
vllm/model_executor/models/gemma3n_audio_utils.py
vllm/model_executor/models/gemma3n_mm.py
vllm/model_executor/models/gemma4.py
vllm/model_executor/models/gemma4_mm.py
vllm/model_executor/models/gemma4_utils.py
vllm/model_executor/models/glm.py
vllm/model_executor/models/glm4.py
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_moe.py
vllm/model_executor/models/glm4_moe_lite.py
vllm/model_executor/models/glm4_moe_lite_mtp.py
vllm/model_executor/models/glm4_moe_mtp.py
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm_ocr.py
vllm/model_executor/models/glm_ocr_mtp.py
vllm/model_executor/models/glmasr.py
vllm/model_executor/models/glmasr_utils.py
vllm/model_executor/models/gpt2.py
vllm/model_executor/models/gpt_bigcode.py
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/gpt_oss.py
vllm/model_executor/models/granite.py
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/granitemoehybrid.py
vllm/model_executor/models/granitemoeshared.py
vllm/model_executor/models/gritlm.py
vllm/model_executor/models/grok1.py
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/hunyuan_v1.py
vllm/model_executor/models/hunyuan_vision.py
vllm/model_executor/models/hyperclovax.py
vllm/model_executor/models/hyperclovax_vision.py
vllm/model_executor/models/hyperclovax_vision_v2.py
vllm/model_executor/models/idefics2_vision_model.py
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/interfaces.py
vllm/model_executor/models/interfaces_base.py
vllm/model_executor/models/intern_vit.py
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2_ve.py
vllm/model_executor/models/interns1.py
vllm/model_executor/models/interns1_pro.py
vllm/model_executor/models/interns1_vit.py
vllm/model_executor/models/internvl.py
vllm/model_executor/models/iquest_loopcoder.py
vllm/model_executor/models/isaac.py
vllm/model_executor/models/jais.py
vllm/model_executor/models/jais2.py
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jina_vl.py
vllm/model_executor/models/kanana_v.py
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye_vl1_5.py
vllm/model_executor/models/kimi_audio.py
vllm/model_executor/models/kimi_k25.py
vllm/model_executor/models/kimi_k25_vit.py
vllm/model_executor/models/kimi_linear.py
vllm/model_executor/models/kimi_vl.py
vllm/model_executor/models/lfm2.py
vllm/model_executor/models/lfm2_moe.py
vllm/model_executor/models/lfm2_siglip2.py
vllm/model_executor/models/lfm2_vl.py
vllm/model_executor/models/lightonocr.py
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama4.py
vllm/model_executor/models/llama4_eagle.py
vllm/model_executor/models/llama_eagle.py
vllm/model_executor/models/llama_eagle3.py
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/longcat_flash.py
vllm/model_executor/models/longcat_flash_mtp.py
vllm/model_executor/models/mamba.py
vllm/model_executor/models/mamba2.py
vllm/model_executor/models/medusa.py
vllm/model_executor/models/midashenglm.py
vllm/model_executor/models/mimo.py
vllm/model_executor/models/mimo_mtp.py
vllm/model_executor/models/mimo_v2_flash.py
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm3.py
vllm/model_executor/models/minicpm_eagle.py
vllm/model_executor/models/minicpmo.py
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minimax_m2.py
vllm/model_executor/models/minimax_text_01.py
vllm/model_executor/models/minimax_vl_01.py
vllm/model_executor/models/mistral.py
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral_large_3.py
vllm/model_executor/models/mistral_large_3_eagle.py
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mllama4.py
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/modernbert.py
vllm/model_executor/models/module_mapping.py
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo2.py
vllm/model_executor/models/moonvit.py
vllm/model_executor/models/mpt.py
vllm/model_executor/models/musicflamingo.py
vllm/model_executor/models/nano_nemotron_vl.py
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron_h.py
vllm/model_executor/models/nemotron_h_mtp.py
vllm/model_executor/models/nemotron_nas.py
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_vl.py
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo2.py
vllm/model_executor/models/olmo_hybrid.py
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/opencua.py
vllm/model_executor/models/openpangu.py
vllm/model_executor/models/openpangu_mtp.py
vllm/model_executor/models/openpangu_vl.py
vllm/model_executor/models/opt.py
vllm/model_executor/models/orion.py
vllm/model_executor/models/ouro.py
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis2_5.py
vllm/model_executor/models/paddleocr_vl.py
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/parakeet.py
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi3.py
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm_audio.py
vllm/model_executor/models/phi4mm_utils.py
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/plamo2.py
vllm/model_executor/models/plamo3.py
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2_5_omni_thinker.py
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_rm.py
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen3.py
vllm/model_executor/models/qwen3_5.py
vllm/model_executor/models/qwen3_5_mtp.py
vllm/model_executor/models/qwen3_asr.py
vllm/model_executor/models/qwen3_asr_forced_aligner.py
vllm/model_executor/models/qwen3_asr_realtime.py
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next_mtp.py
vllm/model_executor/models/qwen3_omni_moe_thinker.py
vllm/model_executor/models/qwen3_vl.py
vllm/model_executor/models/qwen3_vl_moe.py
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/radio.py
vllm/model_executor/models/registry.py
vllm/model_executor/models/roberta.py
vllm/model_executor/models/rvl.py
vllm/model_executor/models/sarvam.py
vllm/model_executor/models/seed_oss.py
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip2navit.py
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/smolvlm.py
vllm/model_executor/models/solar.py
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/step1.py
vllm/model_executor/models/step3_text.py
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3p5.py
vllm/model_executor/models/step3p5_mtp.py
vllm/model_executor/models/step_vl.py
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/telechat2.py
vllm/model_executor/models/teleflm.py
vllm/model_executor/models/terratorch.py
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/utils.py
vllm/model_executor/models/vision.py
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral_realtime.py
vllm/model_executor/models/voyage.py
vllm/model_executor/models/whisper.py
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_utils.py
vllm/model_executor/models/zamba2.py
vllm/model_executor/models/transformers/__init__.py
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/causal.py
vllm/model_executor/models/transformers/legacy.py
vllm/model_executor/models/transformers/moe.py
vllm/model_executor/models/transformers/multimodal.py
vllm/model_executor/models/transformers/pooling.py
vllm/model_executor/models/transformers/utils.py
vllm/model_executor/offloader/__init__.py
vllm/model_executor/offloader/base.py
vllm/model_executor/offloader/prefetch.py
vllm/model_executor/offloader/prefetch_ops.py
vllm/model_executor/offloader/uva.py
vllm/model_executor/warmup/__init__.py
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/kernel_warmup.py
vllm/multimodal/__init__.py
vllm/multimodal/audio.py
vllm/multimodal/cache.py
vllm/multimodal/encoder_budget.py
vllm/multimodal/evs.py
vllm/multimodal/hasher.py
vllm/multimodal/image.py
vllm/multimodal/inputs.py
vllm/multimodal/parse.py
vllm/multimodal/registry.py
vllm/multimodal/utils.py
vllm/multimodal/video.py
vllm/multimodal/media/__init__.py
vllm/multimodal/media/audio.py
vllm/multimodal/media/base.py
vllm/multimodal/media/connector.py
vllm/multimodal/media/image.py
vllm/multimodal/media/video.py
vllm/multimodal/processing/__init__.py
vllm/multimodal/processing/context.py
vllm/multimodal/processing/dummy_inputs.py
vllm/multimodal/processing/inputs.py
vllm/multimodal/processing/processor.py
vllm/parser/__init__.py
vllm/parser/abstract_parser.py
vllm/parser/minimax_m2_parser.py
vllm/parser/parser_manager.py
vllm/platforms/__init__.py
vllm/platforms/cpu.py
vllm/platforms/cuda.py
vllm/platforms/interface.py
vllm/platforms/rocm.py
vllm/platforms/tpu.py
vllm/platforms/xpu.py
vllm/platforms/zen_cpu.py
vllm/plugins/__init__.py
vllm/plugins/io_processors/__init__.py
vllm/plugins/io_processors/interface.py
vllm/plugins/lora_resolvers/__init__.py
vllm/plugins/lora_resolvers/filesystem_resolver.py
vllm/plugins/lora_resolvers/hf_hub_resolver.py
vllm/profiler/__init__.py
vllm/profiler/layerwise_profile.py
vllm/profiler/utils.py
vllm/profiler/wrapper.py
vllm/ray/__init__.py
vllm/ray/lazy_utils.py
vllm/ray/ray_env.py
vllm/reasoning/__init__.py
vllm/reasoning/abs_reasoning_parsers.py
vllm/reasoning/basic_parsers.py
vllm/reasoning/deepseek_r1_reasoning_parser.py
vllm/reasoning/deepseek_v3_reasoning_parser.py
vllm/reasoning/ernie45_reasoning_parser.py
vllm/reasoning/gemma4_reasoning_parser.py
vllm/reasoning/gemma4_utils.py
vllm/reasoning/gptoss_reasoning_parser.py
vllm/reasoning/granite_reasoning_parser.py
vllm/reasoning/hunyuan_a13b_reasoning_parser.py
vllm/reasoning/identity_reasoning_parser.py
vllm/reasoning/kimi_k2_reasoning_parser.py
vllm/reasoning/minimax_m2_reasoning_parser.py
vllm/reasoning/mistral_reasoning_parser.py
vllm/reasoning/nemotron_v3_reasoning_parser.py
vllm/reasoning/olmo3_reasoning_parser.py
vllm/reasoning/qwen3_reasoning_parser.py
vllm/reasoning/seedoss_reasoning_parser.py
vllm/reasoning/step3_reasoning_parser.py
vllm/reasoning/step3p5_reasoning_parser.py
vllm/renderers/__init__.py
vllm/renderers/base.py
vllm/renderers/deepseek_v32.py
vllm/renderers/embed_utils.py
vllm/renderers/grok2.py
vllm/renderers/hf.py
vllm/renderers/mistral.py
vllm/renderers/params.py
vllm/renderers/registry.py
vllm/renderers/terratorch.py
vllm/renderers/inputs/__init__.py
vllm/renderers/inputs/preprocess.py
vllm/renderers/inputs/tokenize.py
vllm/third_party/__init__.py
vllm/third_party/pynvml.py
vllm/third_party/flashmla/__init__.py
vllm/tokenizers/__init__.py
vllm/tokenizers/deepseek_v32.py
vllm/tokenizers/deepseek_v32_encoding.py
vllm/tokenizers/detokenizer_utils.py
vllm/tokenizers/grok2.py
vllm/tokenizers/hf.py
vllm/tokenizers/kimi_audio.py
vllm/tokenizers/mistral.py
vllm/tokenizers/protocol.py
vllm/tokenizers/qwen_vl.py
vllm/tokenizers/registry.py
vllm/tool_parsers/__init__.py
vllm/tool_parsers/abstract_tool_parser.py
vllm/tool_parsers/deepseekv31_tool_parser.py
vllm/tool_parsers/deepseekv32_tool_parser.py
vllm/tool_parsers/deepseekv3_tool_parser.py
vllm/tool_parsers/ernie45_tool_parser.py
vllm/tool_parsers/functiongemma_tool_parser.py
vllm/tool_parsers/gemma4_tool_parser.py
vllm/tool_parsers/gemma4_utils.py
vllm/tool_parsers/gigachat3_tool_parser.py
vllm/tool_parsers/glm47_moe_tool_parser.py
vllm/tool_parsers/glm4_moe_tool_parser.py
vllm/tool_parsers/granite4_tool_parser.py
vllm/tool_parsers/granite_20b_fc_tool_parser.py
vllm/tool_parsers/granite_tool_parser.py
vllm/tool_parsers/hermes_tool_parser.py
vllm/tool_parsers/hunyuan_a13b_tool_parser.py
vllm/tool_parsers/internlm2_tool_parser.py
vllm/tool_parsers/jamba_tool_parser.py
vllm/tool_parsers/kimi_k2_tool_parser.py
vllm/tool_parsers/llama4_pythonic_tool_parser.py
vllm/tool_parsers/llama_tool_parser.py
vllm/tool_parsers/longcat_tool_parser.py
vllm/tool_parsers/minimax_m2_tool_parser.py
vllm/tool_parsers/minimax_tool_parser.py
vllm/tool_parsers/mistral_tool_parser.py
vllm/tool_parsers/olmo3_tool_parser.py
vllm/tool_parsers/openai_tool_parser.py
vllm/tool_parsers/phi4mini_tool_parser.py
vllm/tool_parsers/pythonic_tool_parser.py
vllm/tool_parsers/qwen3coder_tool_parser.py
vllm/tool_parsers/qwen3xml_tool_parser.py
vllm/tool_parsers/seed_oss_tool_parser.py
vllm/tool_parsers/step3_tool_parser.py
vllm/tool_parsers/step3p5_tool_parser.py
vllm/tool_parsers/utils.py
vllm/tool_parsers/xlam_tool_parser.py
vllm/tracing/__init__.py
vllm/tracing/otel.py
vllm/tracing/utils.py
vllm/transformers_utils/__init__.py
vllm/transformers_utils/config.py
vllm/transformers_utils/config_parser_base.py
vllm/transformers_utils/dynamic_module.py
vllm/transformers_utils/gguf_utils.py
vllm/transformers_utils/model_arch_config_convertor.py
vllm/transformers_utils/processor.py
vllm/transformers_utils/repo_utils.py
vllm/transformers_utils/runai_utils.py
vllm/transformers_utils/s3_utils.py
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/utils.py
vllm/transformers_utils/chat_templates/__init__.py
vllm/transformers_utils/chat_templates/registry.py
vllm/transformers_utils/chat_templates/template_basic.jinja
vllm/transformers_utils/chat_templates/template_blip2.jinja
vllm/transformers_utils/chat_templates/template_chatml.jinja
vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja
vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja
vllm/transformers_utils/chat_templates/template_fuyu.jinja
vllm/transformers_utils/chat_templates/template_kimi_audio.jinja
vllm/transformers_utils/chat_templates/template_minicpmv45.jinja
vllm/transformers_utils/configs/AXK1.py
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/afmoe.py
vllm/transformers_utils/configs/arctic.py
vllm/transformers_utils/configs/bagel.py
vllm/transformers_utils/configs/chatglm.py
vllm/transformers_utils/configs/colmodernvbert.py
vllm/transformers_utils/configs/colpali.py
vllm/transformers_utils/configs/colqwen3.py
vllm/transformers_utils/configs/deepseek_vl2.py
vllm/transformers_utils/configs/dotsocr.py
vllm/transformers_utils/configs/eagle.py
vllm/transformers_utils/configs/extract_hidden_states.py
vllm/transformers_utils/configs/falcon.py
vllm/transformers_utils/configs/flex_olmo.py
vllm/transformers_utils/configs/funaudiochat.py
vllm/transformers_utils/configs/hunyuan_vl.py
vllm/transformers_utils/configs/hyperclovax.py
vllm/transformers_utils/configs/isaac.py
vllm/transformers_utils/configs/jais.py
vllm/transformers_utils/configs/kimi_k25.py
vllm/transformers_utils/configs/kimi_linear.py
vllm/transformers_utils/configs/kimi_vl.py
vllm/transformers_utils/configs/lfm2_moe.py
vllm/transformers_utils/configs/medusa.py
vllm/transformers_utils/configs/midashenglm.py
vllm/transformers_utils/configs/mistral.py
vllm/transformers_utils/configs/mlp_speculator.py
vllm/transformers_utils/configs/moonvit.py
vllm/transformers_utils/configs/nemotron.py
vllm/transformers_utils/configs/nemotron_h.py
vllm/transformers_utils/configs/olmo_hybrid.py
vllm/transformers_utils/configs/ovis.py
vllm/transformers_utils/configs/parakeet.py
vllm/transformers_utils/configs/qwen3_5.py
vllm/transformers_utils/configs/qwen3_5_moe.py
vllm/transformers_utils/configs/qwen3_asr.py
vllm/transformers_utils/configs/qwen3_next.py
vllm/transformers_utils/configs/radio.py
vllm/transformers_utils/configs/step3_vl.py
vllm/transformers_utils/configs/step3p5.py
vllm/transformers_utils/configs/tarsier2.py
vllm/transformers_utils/configs/ultravox.py
vllm/transformers_utils/configs/speculators/__init__.py
vllm/transformers_utils/configs/speculators/algos.py
vllm/transformers_utils/configs/speculators/base.py
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/bagel.py
vllm/transformers_utils/processors/cohere_asr.py
vllm/transformers_utils/processors/deepseek_ocr.py
vllm/transformers_utils/processors/deepseek_vl2.py
vllm/transformers_utils/processors/fireredasr2.py
vllm/transformers_utils/processors/funasr.py
vllm/transformers_utils/processors/glm4v.py
vllm/transformers_utils/processors/h2ovl.py
vllm/transformers_utils/processors/hunyuan_vl.py
vllm/transformers_utils/processors/hunyuan_vl_image.py
vllm/transformers_utils/processors/internvl.py
vllm/transformers_utils/processors/isaac.py
vllm/transformers_utils/processors/kimi_audio.py
vllm/transformers_utils/processors/kimi_k25.py
vllm/transformers_utils/processors/nano_nemotron_vl.py
vllm/transformers_utils/processors/nemotron_vl.py
vllm/transformers_utils/processors/nvlm_d.py
vllm/transformers_utils/processors/ovis.py
vllm/transformers_utils/processors/ovis2_5.py
vllm/transformers_utils/processors/pixtral.py
vllm/transformers_utils/processors/qwen3_asr.py
vllm/transformers_utils/processors/qwen_vl.py
vllm/transformers_utils/processors/step3_vl.py
vllm/transformers_utils/processors/voxtral.py
vllm/triton_utils/__init__.py
vllm/triton_utils/allocation.py
vllm/triton_utils/importing.py
vllm/usage/__init__.py
vllm/usage/usage_lib.py
vllm/utils/__init__.py
vllm/utils/argparse_utils.py
vllm/utils/async_utils.py
vllm/utils/cache.py
vllm/utils/collection_utils.py
vllm/utils/counter.py
vllm/utils/cpu_triton_utils.py
vllm/utils/deep_gemm.py
vllm/utils/flashinfer.py
vllm/utils/func_utils.py
vllm/utils/gc_utils.py
vllm/utils/hashing.py
vllm/utils/import_utils.py
vllm/utils/jsontree.py
vllm/utils/math_utils.py
vllm/utils/mem_constants.py
vllm/utils/mem_utils.py
vllm/utils/mistral.py
vllm/utils/multi_stream_utils.py
vllm/utils/nccl.py
vllm/utils/network_utils.py
vllm/utils/nvtx_pytorch_hooks.py
vllm/utils/platform_utils.py
vllm/utils/print_utils.py
vllm/utils/profiling.py
vllm/utils/registry.py
vllm/utils/serial_utils.py
vllm/utils/system_utils.py
vllm/utils/tensor_schema.py
vllm/utils/torch_utils.py
vllm/utils/tqdm_utils.py
vllm/v1/__init__.py
vllm/v1/cudagraph_dispatcher.py
vllm/v1/kv_cache_interface.py
vllm/v1/outputs.py
vllm/v1/request.py
vllm/v1/serial_utils.py
vllm/v1/utils.py
vllm/v1/attention/__init__.py
vllm/v1/attention/backend.py
vllm/v1/attention/selector.py
vllm/v1/attention/backends/__init__.py
vllm/v1/attention/backends/cpu_attn.py
vllm/v1/attention/backends/fa_utils.py
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn_diffkv.py
vllm/v1/attention/backends/flashinfer.py
vllm/v1/attention/backends/flex_attention.py
vllm/v1/attention/backends/gdn_attn.py
vllm/v1/attention/backends/linear_attn.py
vllm/v1/attention/backends/mamba1_attn.py
vllm/v1/attention/backends/mamba2_attn.py
vllm/v1/attention/backends/mamba_attn.py
vllm/v1/attention/backends/registry.py
vllm/v1/attention/backends/rocm_aiter_fa.py
vllm/v1/attention/backends/rocm_aiter_unified_attn.py
vllm/v1/attention/backends/rocm_attn.py
vllm/v1/attention/backends/short_conv_attn.py
vllm/v1/attention/backends/tree_attn.py
vllm/v1/attention/backends/triton_attn.py
vllm/v1/attention/backends/utils.py
vllm/v1/attention/backends/mla/__init__.py
vllm/v1/attention/backends/mla/aiter_triton_mla.py
vllm/v1/attention/backends/mla/cutlass_mla.py
vllm/v1/attention/backends/mla/flashattn_mla.py
vllm/v1/attention/backends/mla/flashinfer_mla.py
vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla_sparse.py
vllm/v1/attention/backends/mla/indexer.py
vllm/v1/attention/backends/mla/rocm_aiter_mla.py
vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
vllm/v1/attention/backends/mla/sparse_utils.py
vllm/v1/attention/backends/mla/triton_mla.py
vllm/v1/attention/backends/mla/xpu_mla_sparse.py
vllm/v1/attention/ops/__init__.py
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
vllm/v1/attention/ops/common.py
vllm/v1/attention/ops/dcp_alltoall.py
vllm/v1/attention/ops/flashmla.py
vllm/v1/attention/ops/merge_attn_states.py
vllm/v1/attention/ops/paged_attn.py
vllm/v1/attention/ops/prefix_prefill.py
vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
vllm/v1/attention/ops/triton_decode_attention.py
vllm/v1/attention/ops/triton_merge_attn_states.py
vllm/v1/attention/ops/triton_prefill_attention.py
vllm/v1/attention/ops/triton_reshape_and_cache_flash.py
vllm/v1/attention/ops/triton_unified_attention.py
vllm/v1/attention/ops/vit_attn_wrappers.py
vllm/v1/attention/ops/xpu_mla_sparse.py
vllm/v1/core/__init__.py
vllm/v1/core/block_pool.py
vllm/v1/core/encoder_cache_manager.py
vllm/v1/core/kv_cache_coordinator.py
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_metrics.py
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/single_type_kv_cache_manager.py
vllm/v1/core/sched/__init__.py
vllm/v1/core/sched/async_scheduler.py
vllm/v1/core/sched/interface.py
vllm/v1/core/sched/output.py
vllm/v1/core/sched/request_queue.py
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/utils.py
vllm/v1/engine/__init__.py
vllm/v1/engine/async_llm.py
vllm/v1/engine/coordinator.py
vllm/v1/engine/core.py
vllm/v1/engine/core_client.py
vllm/v1/engine/detokenizer.py
vllm/v1/engine/exceptions.py
vllm/v1/engine/input_processor.py
vllm/v1/engine/llm_engine.py
vllm/v1/engine/logprobs.py
vllm/v1/engine/output_processor.py
vllm/v1/engine/parallel_sampling.py
vllm/v1/engine/tensor_ipc.py
vllm/v1/engine/utils.py
vllm/v1/executor/__init__.py
vllm/v1/executor/abstract.py
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/ray_distributed_executor.py
vllm/v1/executor/ray_executor.py
vllm/v1/executor/ray_utils.py
vllm/v1/executor/uniproc_executor.py
vllm/v1/kv_offload/__init__.py
vllm/v1/kv_offload/abstract.py
vllm/v1/kv_offload/factory.py
vllm/v1/kv_offload/mediums.py
vllm/v1/kv_offload/reuse_manager.py
vllm/v1/kv_offload/spec.py
vllm/v1/kv_offload/cpu/__init__.py
vllm/v1/kv_offload/cpu/manager.py
vllm/v1/kv_offload/cpu/spec.py
vllm/v1/kv_offload/cpu/policies/__init__.py
vllm/v1/kv_offload/cpu/policies/abstract.py
vllm/v1/kv_offload/cpu/policies/arc.py
vllm/v1/kv_offload/cpu/policies/lru.py
vllm/v1/kv_offload/worker/__init__.py
vllm/v1/kv_offload/worker/cpu_gpu.py
vllm/v1/kv_offload/worker/worker.py
vllm/v1/metrics/__init__.py
vllm/v1/metrics/loggers.py
vllm/v1/metrics/perf.py
vllm/v1/metrics/prometheus.py
vllm/v1/metrics/ray_wrappers.py
vllm/v1/metrics/reader.py
vllm/v1/metrics/stats.py
vllm/v1/metrics/utils.py
vllm/v1/pool/__init__.py
vllm/v1/pool/late_interaction.py
vllm/v1/pool/metadata.py
vllm/v1/sample/__init__.py
vllm/v1/sample/metadata.py
vllm/v1/sample/rejection_sampler.py
vllm/v1/sample/sampler.py
vllm/v1/sample/logits_processor/__init__.py
vllm/v1/sample/logits_processor/builtin.py
vllm/v1/sample/logits_processor/interface.py
vllm/v1/sample/logits_processor/state.py
vllm/v1/sample/ops/__init__.py
vllm/v1/sample/ops/bad_words.py
vllm/v1/sample/ops/logprobs.py
vllm/v1/sample/ops/penalties.py
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/sample/ops/topk_topp_triton.py
vllm/v1/simple_kv_offload/__init__.py
vllm/v1/simple_kv_offload/copy_backend.py
vllm/v1/simple_kv_offload/cuda_mem_ops.py
vllm/v1/simple_kv_offload/manager.py
vllm/v1/simple_kv_offload/metadata.py
vllm/v1/simple_kv_offload/worker.py
vllm/v1/spec_decode/__init__.py
vllm/v1/spec_decode/draft_model.py
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/extract_hidden_states.py
vllm/v1/spec_decode/medusa.py
vllm/v1/spec_decode/metadata.py
vllm/v1/spec_decode/metrics.py
vllm/v1/spec_decode/ngram_proposer.py
vllm/v1/spec_decode/ngram_proposer_gpu.py
vllm/v1/spec_decode/suffix_decoding.py
vllm/v1/spec_decode/utils.py
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/backend_guidance.py
vllm/v1/structured_output/backend_lm_format_enforcer.py
vllm/v1/structured_output/backend_outlines.py
vllm/v1/structured_output/backend_types.py
vllm/v1/structured_output/backend_xgrammar.py
vllm/v1/structured_output/request.py
vllm/v1/structured_output/utils.py
vllm/v1/worker/__init__.py
vllm/v1/worker/block_table.py
vllm/v1/worker/cp_utils.py
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_worker.py
vllm/v1/worker/dp_utils.py
vllm/v1/worker/ec_connector_model_runner_mixin.py
vllm/v1/worker/encoder_cudagraph.py
vllm/v1/worker/encoder_cudagraph_defs.py
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_ubatch_wrapper.py
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/kv_connector_model_runner_mixin.py
vllm/v1/worker/lora_model_runner_mixin.py
vllm/v1/worker/mamba_utils.py
vllm/v1/worker/tpu_input_batch.py
vllm/v1/worker/ubatch_utils.py
vllm/v1/worker/ubatching.py
vllm/v1/worker/utils.py
vllm/v1/worker/worker_base.py
vllm/v1/worker/workspace.py
vllm/v1/worker/xpu_model_runner.py
vllm/v1/worker/xpu_worker.py
vllm/v1/worker/gpu/README.md
vllm/v1/worker/gpu/__init__.py
vllm/v1/worker/gpu/async_utils.py
vllm/v1/worker/gpu/attn_utils.py
vllm/v1/worker/gpu/block_table.py
vllm/v1/worker/gpu/buffer_utils.py
vllm/v1/worker/gpu/cp_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/dp_utils.py
vllm/v1/worker/gpu/eplb_utils.py
vllm/v1/worker/gpu/input_batch.py
vllm/v1/worker/gpu/kv_connector.py
vllm/v1/worker/gpu/lora_utils.py
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/pp_utils.py
vllm/v1/worker/gpu/states.py
vllm/v1/worker/gpu/structured_outputs.py
vllm/v1/worker/gpu/warmup.py
vllm/v1/worker/gpu/metrics/__init__.py
vllm/v1/worker/gpu/metrics/logits.py
vllm/v1/worker/gpu/mm/__init__.py
vllm/v1/worker/gpu/mm/encoder_cache.py
vllm/v1/worker/gpu/mm/encoder_runner.py
vllm/v1/worker/gpu/mm/rope.py
vllm/v1/worker/gpu/model_states/__init__.py
vllm/v1/worker/gpu/model_states/default.py
vllm/v1/worker/gpu/model_states/interface.py
vllm/v1/worker/gpu/model_states/whisper.py
vllm/v1/worker/gpu/pool/__init__.py
vllm/v1/worker/gpu/pool/late_interaction_runner.py
vllm/v1/worker/gpu/pool/pooling_runner.py
vllm/v1/worker/gpu/sample/__init__.py
vllm/v1/worker/gpu/sample/bad_words.py
vllm/v1/worker/gpu/sample/gumbel.py
vllm/v1/worker/gpu/sample/logit_bias.py
vllm/v1/worker/gpu/sample/logprob.py
vllm/v1/worker/gpu/sample/min_p.py
vllm/v1/worker/gpu/sample/output.py
vllm/v1/worker/gpu/sample/penalties.py
vllm/v1/worker/gpu/sample/prompt_logprob.py
vllm/v1/worker/gpu/sample/sampler.py
vllm/v1/worker/gpu/sample/states.py
vllm/v1/worker/gpu/spec_decode/__init__.py
vllm/v1/worker/gpu/spec_decode/rejection_sampler.py
vllm/v1/worker/gpu/spec_decode/synthetic_rejection_sampler_utils.py
vllm/v1/worker/gpu/spec_decode/utils.py
vllm/v1/worker/gpu/spec_decode/eagle/__init__.py
vllm/v1/worker/gpu/spec_decode/eagle/cudagraph.py
vllm/v1/worker/gpu/spec_decode/eagle/eagle3_utils.py
vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
vllm/v1/worker/gpu/spec_decode/eagle/utils.py
vllm/vllm_flash_attn/.gitkeep
vllm/vllm_flash_attn/__init__.py
vllm/vllm_flash_attn/flash_attn_interface.py