
[DT][CPU] Failed to relayout if the incoming layout is from GPU (exceeded stack allocation limit) #22939

@hanhanW

Description

I haven't triaged it yet. IIRC, the relayout ops are fused into scf.forall, and we may need to make the same fusion happen at the vector-level tiling stage as well. Something is missing here.
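
For intuition, here is a hand-written numpy sketch (not IREE's actual codegen) of what a tile-at-a-time relayout amounts to, based on my reading of the encoding in the repro below; all names are illustrative. The point is that a fused relayout only ever needs one 64x16 f16 tile (2 KiB) of scratch, while an unfused one would materialize the whole packed tensor at once:

import numpy as np

# Tile-at-a-time model of the pack part of the encoding below:
# inner_dims_pos = [1, 0], inner_tiles = [64, 16], outer_dims_perm = [1, 0].
# Each iteration reads one 16x64 source chunk and writes one 64x16 tile.
def relayout_tiled(src: np.ndarray) -> np.ndarray:
    out = np.zeros((13, 29, 64, 16), np.float16)   # ceil(789/64) x ceil(456/16) tiles
    for j in range(13):                            # tiles along source dim 1 (size 64)
        for i in range(29):                        # tiles along source dim 0 (size 16)
            tile = np.zeros((16, 64), np.float16)  # zero-padded at the edges
            chunk = src[i*16:(i+1)*16, j*64:(j+1)*64]
            tile[:chunk.shape[0], :chunk.shape[1]] = chunk
            out[j, i] = tile.T                     # inner dims come from [dim1, dim0]
    return out

print(relayout_tiled(np.ones((456, 789), np.float16)).shape)  # (13, 29, 64, 16)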

To repro:

Run iree-compile --compile-from=executable-sources --compile-to=executable-targets repro.mlir

hal.executable public @_encoding_0 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+f16c,+fsgsbase,+sahf,+lzcnt,+movbe,+mwaitx,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+cx8,+crc32,+invpcid,+rdpru,+x87,+fxsr", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_resolver<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @_encoding_0_encode_456x789xf16_to_456x789xf16 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) count(%arg0: !hal.device) -> (index, index, index) {
      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @_encoding_0_encode_456x789xf16_to_456x789xf16() {
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<456x789xf16>>
        %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<456x789xf16, #iree_encoding.layout<[#iree_gpu.gpu_encoding_resolver<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [64, 16], outerDimsPerm = [1, 0], swizzle = {expandShape = [[["CrossThread", 2 : i16], ["CrossIntrinsic", 2 : i16], ["CrossThread", 16 : i16]], [["CrossThread", 1 : i16], ["Internal", 16 : i16]]], permutation = [0, 1, 3, 2, 4]}}}>]>>>
        %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [456, 789], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<456x789xf16>> -> tensor<456x789xf16>
        %3 = iree_encoding.set_encoding %2 : tensor<456x789xf16> -> tensor<456x789xf16, #iree_encoding.layout<[#iree_gpu.gpu_encoding_resolver<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [64, 16], outerDimsPerm = [1, 0], swizzle = {expandShape = [[["CrossThread", 2 : i16], ["CrossIntrinsic", 2 : i16], ["CrossThread", 16 : i16]], [["CrossThread", 1 : i16], ["Internal", 16 : i16]]], permutation = [0, 1, 3, 2, 4]}}}>]>>
        iree_tensor_ext.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [456, 789], strides = [1, 1] : tensor<456x789xf16, #iree_encoding.layout<[#iree_gpu.gpu_encoding_resolver<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [64, 16], outerDimsPerm = [1, 0], swizzle = {expandShape = [[["CrossThread", 2 : i16], ["CrossIntrinsic", 2 : i16], ["CrossThread", 16 : i16]], [["CrossThread", 1 : i16], ["Internal", 16 : i16]]], permutation = [0, 1, 3, 2, 4]}}}>]>> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<456x789xf16, #iree_encoding.layout<[#iree_gpu.gpu_encoding_resolver<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [64, 16], outerDimsPerm = [1, 0], swizzle = {expandShape = [[["CrossThread", 2 : i16], ["CrossIntrinsic", 2 : i16], ["CrossThread", 16 : i16]], [["CrossThread", 1 : i16], ["Internal", 16 : i16]]], permutation = [0, 1, 3, 2, 4]}}}>]>>>
        return
      }
    }
  }
}
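
To make the encoding attribute concrete (hand-derived from the attribute text above; an illustrative model, not IREE's implementation): the swizzle expands each 64x16 inner tile to (2, 2, 16, 1, 16) and permutation [0, 1, 3, 2, 4] reorders those axes. The full packed tensor is roughly 24x the target's max_stack_allocation_size, which is presumably what trips the limit in the title when the relayout is not tiled:

import numpy as np

# Swizzle from the encoding: expandShape splits the 64x16 tile into
# (2, 2, 16) x (1, 16); permutation [0, 1, 3, 2, 4] then reorders the axes.
tile = np.arange(64 * 16, dtype=np.float16).reshape(64, 16)
swizzled = tile.reshape(2, 2, 16, 1, 16).transpose(0, 1, 3, 2, 4)
print(swizzled.shape)         # (2, 2, 1, 16, 16); still 2048 bytes per tile

# Whole packed tensor: 13 * 29 tiles * 2048 bytes = 772096 bytes (~754 KiB),
# versus max_stack_allocation_size = 32768 in the target attributes above.
print(13 * 29 * 64 * 16 * 2)  # 772096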

Labels

codegen/llvm (LLVM code generation compiler backend)
