SumanthRH · devpatelio · Sep 21, 2025 · Sep 21, 2025 · Sep 21, 2025 · Sep 21, 2025
diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 
 def print_top_10_largest_files(zip_file):

diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
@@ -8,7 +8,8 @@
 <html>
     <body>
     <h1>Links for vLLM</h1/>
-        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
     </body>
 </html>
 """
@@ -21,7 +22,25 @@
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
+    # keep the platform tags in sync with .buildkite/scripts/upload-wheels.sh
+    if "x86_64" in filename:
+        x86_wheel = filename
+        arm_wheel = filename.replace("x86_64", "aarch64").replace(
+            "manylinux1", "manylinux2014"
+        )
+    elif "aarch64" in filename:
+        x86_wheel = filename.replace("aarch64", "x86_64").replace(
+            "manylinux2014", "manylinux1"
+        )
+        arm_wheel = filename
+    else:
+        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(
+            x86_wheel=x86_wheel,
+            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+            arm_wheel=arm_wheel,
+            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+        )
     )
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 
 usage() {
     echo``

diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.4
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 
 usage() {
     echo``

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
 If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
 
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
 
 |   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
     - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
     - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
     - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
     - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
     - 8x Nvidia A100 GPUs

diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -3,44 +3,129 @@
 import argparse
 import json
 import os
+from importlib import util
 
 import pandas as pd
 
+plotly_found = util.find_spec("plotly.express") is not None
+
 
 def compare_data_columns(
     files, name_column, data_column, info_cols, drop_column, debug=False
 ):
-    print("\ncompare_data_column: " + data_column)
+    """
+    Align concatenation by keys derived from info_cols instead of row order.
+    - Pick one canonical key list: subset of info_cols present in ALL files.
+    - For each file: set index to those keys, aggregate duplicates
+      (mean for metric, first for names).
+    - Concat along axis=1 (indexes align), then reset_index so callers can
+      group by columns.
+    - If --debug, add a <file_label>_name column per file.
+    """
+    print("\ncompare_data_column:", data_column)
+
     frames = []
     raw_data_cols = []
     compare_frames = []
+
+    # 1) choose a canonical key list from info_cols that exists in ALL files
+    cols_per_file = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+    if not key_cols:
+        # soft fallback: use any info_cols present in the first file
+        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+    if not key_cols:
+        raise ValueError(
+            "No common key columns found from info_cols across the input files."
+        )
+
+    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+    meta_added = False
+
     for file in files:
-        data_df = pd.read_json(file)
-        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
-        # Show all info columns in the first couple columns
-        if not frames:
-            for col in info_cols:
-                if col not in serving_df.columns:
-                    print(f"Skipping missing column: {col}")
-                    continue
-                frames.append(serving_df[col])
-        # only show test name under debug mode
-        if debug is True:
-            serving_df = serving_df.rename(columns={name_column: file + "_name"})
-            frames.append(serving_df[file + "_name"])
-
-        file = "/".join(file.split("/")[:-1])
-        serving_df = serving_df.rename(columns={data_column: file})
-        frames.append(serving_df[file])
-        raw_data_cols.append(file)
-        compare_frames.append(serving_df[file])
+        df = pd.read_json(file, orient="records")
+
+        # Keep rows that actually have the compared metric (same as original behavior)
+        if drop_column in df.columns:
+            df = df.dropna(subset=[drop_column], ignore_index=True)
+
+        # Stabilize numeric key columns (harmless if missing)
+        for c in (
+            "Input Len",
+            "Output Len",
+            "TP Size",
+            "PP Size",
+            "# of max concurrency.",
+            "qps",
+        ):
+            if c in df.columns:
+                df[c] = pd.to_numeric(df[c], errors="coerce")
+
+        # Ensure all key columns exist
+        for c in key_cols:
+            if c not in df.columns:
+                df[c] = pd.NA
+
+        # Set index = key_cols and aggregate duplicates → unique MultiIndex
+        df_idx = df.set_index(key_cols, drop=False)
+
+        # meta (key columns), unique per key
+        meta = df_idx[key_cols]
+        if not meta.index.is_unique:
+            meta = meta.groupby(level=key_cols, dropna=False).first()
+
+        # metric series for this file, aggregated to one row per key
+        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
+        s.name = file_label  # column label like original
+
+        # add meta once (from first file) so keys are the leftmost columns
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
+        # debug mode: add an aligned test-name column per file
+        if debug and name_column in df_idx.columns:
+            name_s = df_idx[name_column]
+            if not name_s.index.is_unique:
+                name_s = name_s.groupby(level=key_cols, dropna=False).first()
+            name_s.name = f"{file_label}_name"
+            frames.append(name_s)
+
+        frames.append(s)
+        raw_data_cols.append(file_label)
+        compare_frames.append(s)
+
+        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
         if len(compare_frames) >= 2:
-            # Compare numbers among two files
-            ratio_df = compare_frames[1] / compare_frames[0]
-            frames.append(ratio_df)
-            compare_frames.pop(1)
+            base = compare_frames[0]
+            current = compare_frames[-1]
+            ratio = current / base
+            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+            frames.append(ratio)
 
+    # 3) concat on columns with aligned MultiIndex;
+    # then reset_index to return keys as columns
     concat_df = pd.concat(frames, axis=1)
+    concat_df = concat_df.reset_index(drop=True).reset_index()
+    if "index" in concat_df.columns:
+        concat_df = concat_df.drop(columns=["index"])
+
+    # Ensure key/info columns appear first (in info_cols order)
+    front = [c for c in info_cols if c in concat_df.columns]
+    rest = [c for c in concat_df.columns if c not in front]
+    concat_df = concat_df[front + rest]
+
     print(raw_data_cols)
     return concat_df, raw_data_cols
 
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
 
     df = pd.DataFrame(data)
 
+    # Keep only "serving" tests
+    name_col = next(
+        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+    )
+    if name_col:
+        df = df[
+            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+        ].copy()
+
     # Handle alias column names
     rename_map = {
         "tp_size": "TP Size",
@@ -124,7 +218,7 @@ def split_json_by_tp_pp(
         "--xaxis",
         type=str,
         default="# of max concurrency.",
-        help="column name to use as X Axis in comparision graph",
+        help="column name to use as X Axis in comparison graph",
     )
     args = parser.parse_args()
 
@@ -181,16 +275,14 @@ def split_json_by_tp_pp(
                     f"Expected subset: {filtered_info_cols}, "
                     f"but DataFrame has: {list(output_df.columns)}"
                 )
-
             output_df_sorted = output_df.sort_values(by=existing_group_cols)
             output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
             for name, group in output_groups:
                 html = group.to_html()
                 text_file.write(html_msgs_for_data_cols[i])
                 text_file.write(html)
 
-                if plot is True:
-                    import pandas as pd
+                if plot and plotly_found:
                     import plotly.express as px
 
                     df = group[raw_data_cols]

diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -382,7 +382,7 @@ run_genai_perf_tests() {
       client_command="genai-perf profile \
         -m $model \
         --service-kind openai \
-        --backend vllm \
+        --backend "$backend" \
         --endpoint-type chat \
         --streaming \
         --url localhost:$port \