Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,7 @@ run:nsys --run_under=//tools:nsys

run:rocprofv3 --@zml//platforms:rocm=true
run:rocprofv3 --run_under=//tools:rocprofv3

run:neuron-profile --@zml//platforms:neuron=true
run:neuron-profile --@zml//platforms:cpu=false
run:neuron-profile --run_under='//tools/neuron:profile /tmp/zml-neuron-profile'
2 changes: 1 addition & 1 deletion MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ tpu = use_extension("//platforms/tpu:tpu.bzl", "tpu_packages")
use_repo(tpu, "libpjrt_tpu")

neuron = use_extension("//platforms/neuron:neuron.bzl", "neuron_packages")
use_repo(neuron, "aws-neuronx-collectives", "aws-neuronx-runtime-lib", "libgomp1", "libpjrt_neuron")
use_repo(neuron, "aws-neuronx-collectives", "aws-neuronx-runtime-lib", "aws-neuronx-tools", "libgcc_s1", "libgomp1", "libpjrt_neuron", "libstdcpp6")

non_module_deps = use_extension("//:third_party/non_module_deps.bzl", "non_module_deps")
use_repo(non_module_deps, "arocc", "cloud_accelerator_diagnostics", "com_google_sentencepiece", "flashattn_linux_amd64", "flashattn_linux_arm64", "iree", "libvaxis", "linenoise", "mnist", "mosaic_tpu", "org_swig_swig", "stb", "translate-c", "uucode", "xla", "zigimg")
Expand Down
3 changes: 2 additions & 1 deletion bin/zml-smi/platforms/neuron/nrt.zig
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ pub fn init(allocator: std.mem.Allocator, io: std.Io) !Nrt {
var dev: ?*c.ndl_device_t = null;
var t: c.struct_ndl_device_init_param = .{
.initialize_device = false,
.map_hbm = false,
.num_dram_regions = 0,
.map_hbm = false,
.skip_copy_bufs = true,
};

if (private_fns.ndl_open_device(device_idx, &t, &dev) == 0) {
Expand Down
54 changes: 52 additions & 2 deletions docs/howtos/profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,55 @@ That means:
- ZML still records local host `TraceMe` scopes
- ROCTx ranges from `zml.tracer.scope(...)` stay visible as device annotations

## Neuron Profiling

Use `--config=neuron-profile` with the command you already run. The wrapper
prints the profile directory when the program exits.

```bash
NEURON_RT_VISIBLE_CORES=0,1 \
bazel run --config=neuron-profile //examples/llm -- \
--model=/path/to/model \
--prompt="Explain mixture-of-experts routing in one paragraph." \
--seqlen=128 \
--backend=nki \
--topk=1
```

The default profile root is `/tmp/zml-neuron-profile`. Each run contains:

```text
/tmp/zml-neuron-profile/<run>/
compile/
execution/
```

Start the Neuron Explorer server in one terminal:

```bash
bazel run //tools/neuron:server
```

Open <http://127.0.0.1:3001>. When working over SSH, forward ports `3001`
and `3002` to your local machine.

With the server running, ingest the run from another terminal:

```bash
bazel run //tools/neuron:ingest -- /tmp/zml-neuron-profile/<run>
```

Create a quick summary from one run when you want a file artifact:

```bash
bazel run //tools/neuron:summary-json -- /tmp/zml-neuron-profile/<run>
bazel run //tools/neuron:summary-txt -- /tmp/zml-neuron-profile/<run>
bazel run //tools/neuron:summary-perfetto -- /tmp/zml-neuron-profile/<run>
```

Each summary command writes its output file beside the run and prints the
output path: `summary.json`, `summary.txt`, or `summary.pftrace`.

## macOS Instruments

On macOS, `zml.tracer.scope(...)` emits `os_signpost` intervals. These show up
Expand Down Expand Up @@ -314,8 +363,9 @@ which keeps large traces much more manageable than the old in-memory path.

## Troubleshooting

- If you run under `--config=nsys` or `--config=rocprofv3`, PJRT profiling is
skipped on purpose because the wrappers set `SKIP_PJRT_PROFILER=true`.
- If you run under `--config=nsys`, `--config=rocprofv3`, or
`--config=neuron-profile`, PJRT profiling is skipped on purpose because the
wrappers set `SKIP_PJRT_PROFILER=true`.
- If you do not see device annotations, first check whether you are on a
supported platform:
- CUDA/Linux for NVTX
Expand Down
2 changes: 1 addition & 1 deletion examples/llm/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const Args = struct {
\\ --prompt=<string> Prompt to use for generation (default: none)
\\ --seqlen=<number> Sequence length (default: 2048)
\\ --topk=<number> Top-k sampling cutoff (default: 4)
\\ --backend=<text> Attention backend to use ([vanilla, attnd, cuda_fa2, cuda_fa3], default: auto-selection)
\\ --backend=<text> Attention backend to use ([vanilla, attnd, nki, cuda_fa2, cuda_fa3], default: auto-selection)
\\ --attnd-ip=<addr> Register and prefer the `attnd` backend at the provided `IP:PORT`
\\ --profile Capture a PJRT profile for non-interactive runs and write a Perfetto trace
\\
Expand Down
1 change: 1 addition & 0 deletions examples/llm/models/llama/model.zig
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,7 @@ const SelfAttn = struct {
.vanilla => attention_metadata,
.cuda_fa2 => attention_metadata,
.cuda_fa3 => attention_metadata,
.nki => attention_metadata,
};

const attn_output = zml.attention.attention.attention(
Expand Down
4 changes: 2 additions & 2 deletions examples/llm/models/llama/session.zig
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub const Session = struct {
} };
break :b buffers;
},
.vanilla, .cuda_fa2, .cuda_fa3 => null,
.vanilla, .cuda_fa2, .cuda_fa3, .nki => null,
};
errdefer if (decode_attention_metadata_buffers) |*buffers| zml.attention.attention.Metadata.deinitBuffer(buffers);

Expand Down Expand Up @@ -167,7 +167,7 @@ pub const Session = struct {
.layer_id = try zml.Buffer.scalar(self.io, self.platform, 0, .u16),
.num_tokens = try zml.Buffer.scalar(self.io, self.platform, all_tokens.len, .u32),
} },
.vanilla, .cuda_fa2, .cuda_fa3 => self.attention_metadata_buffers,
.vanilla, .cuda_fa2, .cuda_fa3, .nki => self.attention_metadata_buffers,
};

try self.compiled_model.prefill.run(.{
Expand Down
8 changes: 8 additions & 0 deletions mlir/dialects/stablehlo/stablehlo.zig
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,14 @@ pub fn custom_call(ctx: *mlir.Context, inputs: []const *const mlir.Value, result
});
}

/// Builds a `stablehlo.optimization_barrier` operation.
///
/// `inputs` become the operation's operands and `result_types` its result
/// types, both passed through unchanged as flat lists; `location` is attached
/// to the created operation.
///
/// NOTE(review): the StableHLO spec expects the result types to mirror the
/// operand types one-to-one; this builder does not check that — confirm the
/// callers uphold it.
pub fn optimizationBarrier(ctx: *mlir.Context, inputs: []const *const mlir.Value, result_types: []const *const mlir.Type, location: *const mlir.Location) *mlir.Operation {
    const op_name = "stablehlo.optimization_barrier";
    return mlir.Operation.make(ctx, op_name, .{
        .operands = .{ .flat = inputs },
        .results = .{ .flat = result_types },
        .location = location,
    });
}

pub fn createBuffer(ctx: *mlir.Context, value: *const mlir.Type, location: *const mlir.Location) *mlir.Operation {
return custom_call(ctx, &.{}, &.{value}, .{
.call_target_name = "CreateBuffer",
Expand Down
122 changes: 88 additions & 34 deletions platforms/neuron/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
load("@com_google_protobuf//bazel:upb_proto_library.bzl", "upb_c_proto_library")
load("@rules_cc//cc:cc_library.bzl", "cc_library")
load("@rules_python//python/uv:lock.bzl", uv_lock = "lock")
load("@rules_zig//zig:defs.bzl", "zig_binary", "zig_library", "zig_shared_library")
load("@zml//bazel:cc_import.bzl", "cc_import")
load("@zml//bazel:runfiles.bzl", "runfiles_to_default")

exports_files([
"neuronx_cc_shim.py",
"nki_kernel_compiler.py",
])

# A proxy PJRT Plugin that loads the Neuron PJRT Plugin
# and returns the instance from nested GetPjrtApi.
#
# Additionally, it provides a way to load implicit transitive dependencies
# of neuronx-cc (see add_needed of the patchelf target below).
# The proxy also initializes the Neuron runtime environment and exposes the
# sandboxed compiler binaries to libneuronxla 3.0.
zig_shared_library(
name = "libpjrt_neuron",
srcs = ["neuron.zig"],
main = "libpjrt_neuron.zig",
target_compatible_with = ["@platforms//os:linux"],
visibility = ["@libpjrt_neuron//:__subpackages__"],
zigopts = ["-fno-stack-check"],
deps = [
":libpython",
":libnrt_headers",
":neuron_nrt",
":zmlxneuron",
"//bazel",
"//pjrt",
"//stdx",
"@rules_zig//zig/runfiles",
"@xla//xla/pjrt/c:pjrt_c_api_hdrs",
Expand All @@ -25,12 +35,15 @@ zig_shared_library(

zig_binary(
name = "neuronx-cc",
target_compatible_with = ["@platforms//os:linux"],
data = ["@neuron_py_deps//neuronx_cc"],
linkopts = ["-Wl,-rpath,$$ORIGIN/../lib"],
main = "neuronx-cc.zig",
main = "neuronxcc_compiler_launcher.zig",
tags = ["manual"],
deps = [":libpython"],
target_compatible_with = ["@platforms//os:linux"],
deps = [
":libpython",
":python_launcher",
],
)

runfiles_to_default(
Expand All @@ -40,10 +53,18 @@ runfiles_to_default(
deps = [":neuronx-cc"],
)

# Wraps the `:nki-cc` binary with `runfiles_to_default`, restricted to Linux
# and visible only to packages of the `@libpjrt_neuron` repository.
# NOTE(review): presumably this re-exports the runfiles of `:nki-cc` as default
# outputs so they can be packaged — confirm against //bazel:runfiles.bzl.
runfiles_to_default(
name = "nki-cc_files",
target_compatible_with = ["@platforms//os:linux"],
visibility = ["@libpjrt_neuron//:__subpackages__"],
deps = [":nki-cc"],
)

cc_library(
name = "libpython",
tags = ["manual"],
hdrs = ["libpython.h"],
tags = ["manual"],
visibility = ["//platforms/neuron:__subpackages__"],
deps = [
"@rules_python//python/cc:current_py_cc_headers",
"@rules_python//python/cc:current_py_cc_libs",
Expand All @@ -52,57 +73,77 @@ cc_library(

cc_library(
name = "empty",
tags = ["manual"],
defines = ["ZML_RUNTIME_NEURON_DISABLED"],
visibility = ["//platforms/neuron/topology:__pkg__"],
tags = ["manual"],
)

cc_library(
name = "zmlxneuron",
tags = ["manual"],
defines = ["ZML_RUNTIME_NEURON"],
visibility = ["//platforms/neuron/topology:__pkg__"],
tags = ["manual"],
)

cc_library(
name = "libnrt_headers",
tags = ["manual"],
hdrs = ["nrt.h"],
tags = ["manual"],
visibility = [
"//bin/zml-smi:__subpackages__",
"//platforms/neuron/topology:__pkg__",
],
deps = ["@libpjrt_neuron//:libnrt_headers"],
)

upb_c_proto_library(
name = "xla_data_upb",
deps = ["@xla//xla:xla_data_proto"],
cc_import(
name = "neuron_ncfw",
shared_library = "@aws-neuronx-runtime-lib//:libncfw.patchelf",
tags = ["manual"],
)

upb_c_proto_library(
name = "hlo_proto_upb",
deps = ["@xla//xla/service:hlo_proto"],
cc_import(
name = "neuron_nrt",
shared_library = "@aws-neuronx-runtime-lib//:libnrt.patchelf",
tags = ["manual"],
deps = [":neuron_ncfw"],
)

zig_shared_library(
name = "libneuronxla",
main = "libneuronxla.zig",
shared_lib_name = "libneuronxla.so",
target_compatible_with = ["@platforms//os:linux"],
visibility = ["@libpjrt_neuron//:__subpackages__"],
zigopts = [
"-fno-stack-check",
"-fPIC",
zig_library(
name = "python_launcher",
import_name = "platforms/neuron/python_launcher",
main = "python_launcher.zig",
tags = ["manual"],
visibility = ["//platforms/neuron:__subpackages__"],
deps = [":libpython"],
)

zig_binary(
name = "nki-cc",
data = [
"nki_kernel_compiler.py",
"@neuron_py_deps//neuronx_cc",
"@neuron_py_deps//nki",
],
linkopts = ["-Wl,-rpath,$$ORIGIN/../lib"],
main = "nki_compiler_launcher.zig",
tags = ["manual"],
target_compatible_with = ["@platforms//os:linux"],
visibility = ["//platforms/neuron:__pkg__"],
deps = [
":hlo_proto_upb",
":libpython",
":xla_data_upb",
":python_launcher",
],
)

zig_library(
name = "nki_kernel",
import_name = "platforms/neuron/nki_kernel",
main = "nki_kernel.zig",
tags = ["manual"],
visibility = ["//zml:__pkg__"],
deps = [
":neuron",
"//bazel",
"//stdx",
"//upb",
"@rules_zig//zig/runfiles",
],
)

Expand All @@ -122,16 +163,16 @@ uv_lock(

zig_library(
name = "neuron",
tags = ["manual"],
import_name = "platforms/neuron",
main = "neuron.zig",
tags = ["manual"],
visibility = ["//visibility:public"],
deps = [
"//pjrt",
] + select({
"//platforms:neuron.enabled": [
":libnrt_headers",
":libpython",
":neuron_nrt",
":zmlxneuron",
"//bazel",
"//stdx",
Expand All @@ -141,3 +182,16 @@ zig_library(
"//conditions:default": [":empty"],
}),
)

# Zig library rooted at `topology.zig`, importable as
# `platforms/neuron/topology`. It depends on the `:neuron` library and the
# `:zmlxneuron` define target, and carries `:libpjrt_neuron` as a data
# dependency. Tagged `manual`, so it is only built when requested explicitly
# or pulled in by a dependent target.
zig_library(
name = "topology",
data = [":libpjrt_neuron"],
import_name = "platforms/neuron/topology",
main = "topology.zig",
tags = ["manual"],
visibility = ["//visibility:public"],
deps = [
":neuron",
":zmlxneuron",
],
)
Loading
Loading