[inductor] refine loop split logic #128812

Draft · zhuhaozhe wants to merge 28 commits into base: gh/zhuhaozhe/39/base

Conversation

@zhuhaozhe (Collaborator) commented Jun 17, 2024

This PR aims to improve parallelization by collapsing the vectorized loop. See #122281.

For the case below, the parallel level is only 2, and the vectorized loop cannot be collapsed:

#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199984L); x1+=static_cast<long>(16L))
    {
        auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
        tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
    }
    #pragma omp simd simdlen(8) 
    for(long x1=static_cast<long>(199984L); x1<static_cast<long>(199985L); x1+=static_cast<long>(1L))
    {
        auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
        out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
    }
}

After this PR, we generate the following code:

#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199985L); x1+=static_cast<long>(16L))
    {
        if (x1 >= 0 && x1 <199984) {
            auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
            tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
        }
        if (x1 >= 199984 && x1 <199985) {
            auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
            out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
        }
    }
}
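As a rough illustration of the gain, the parallel iteration space grows from 2 to about 25,000 once the vectorized loop is collapsed. The back-of-the-envelope sketch below is plain Python (not inductor code), using the loop bounds from the kernels above:

import math

x0_range = 2          # outer loop trip count
x1_range = 199985     # inner loop size in elements
vec_len = 16          # vector width used by the inner loop

before = x0_range                                  # only x0 is parallel -> 2
after = x0_range * math.ceil(x1_range / vec_len)   # collapse(2) -> 2 * 12500 = 25000
print(before, after)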

Highlight

For the reduction case, there is a side effect. In the case below, the x1 dim is vectorized and the reduction runs over the x2 dim.

#pragma omp for
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
            }
            [&]
            {
                __at_align__ std::array<float, 8> tmpbuf;
                tmp_acc0_vec.store(tmpbuf.data(), 8);
                #pragma GCC unroll 8
                for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                }
            }
            ()
            ;
        }
    }
    #pragma omp simd simdlen(4) 
    for(int64_t x1=static_cast<int64_t>(16L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(1L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = in_ptr1[static_cast<int64_t>(x1 + (17L*x2) + (306L*x0))];
                tmp_acc0 = max_propagate_nan(tmp_acc0, tmp0);
            }
            out_ptr1[static_cast<int64_t>(x0 + (39L*x1))] = tmp_acc0;
        }
    }
}

After collapsing, the loop order becomes x1 -> x2 -> x1_tail_part, so we need a tmp_acc_arr to hold the reduction result for the x1 tail part. For the reduction_stores, we also need to check x1's value, just as we do in the loop body, since the reduction_stores happen between the x1 and x2 loops. (A pure-Python model of this scheme follows the generated code below.)

#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0_arr[8];           // need an array to hold the acc result for the tail part
            for (int i = 0; i < 8; i++)
            {
                tmp_acc0_arr[i] = -std::numeric_limits<float>::infinity();
            }
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                {
                    if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                        tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
                    }
                    if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
                    {
                        for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                        {
                            auto tmp0 = in_ptr1[static_cast<int64_t>(x1_tail + (17L*x2) + (306L*x0))];
                            tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)] = max_propagate_nan(tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)], tmp0);
                        }
                    }
                }
            }

            // reduction stores
            if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
            {
                [&]
                {
                    __at_align__ std::array<float, 8> tmpbuf;
                    tmp_acc0_vec.store(tmpbuf.data(), 8);
                    #pragma GCC unroll 8
                    for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                    {
                        out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                    }
                }
                ()
                ;
            }
            if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
            {
                for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1_tail))] = tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)];
                }
            }
        }
    }
}
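A minimal pure-Python model of this collapsed-reduction scheme (illustrative only; the names mirror the generated code above, but none of this is inductor code):

import math

NEG_INF = -math.inf
VEC = 8                      # vector width for x1
X1_SIZE, X1_TILED = 17, 16   # full x1 range and its tiled (main-part) size
X2_SIZE = 18                 # reduction range

def reduce_row(load):
    # load(x1, x2) -> float; returns the per-x1 maxima for one x0 slice.
    out = [NEG_INF] * X1_SIZE
    for x1 in range(0, X1_SIZE, VEC):          # collapsed x1 loop, step = VEC
        acc_vec = [NEG_INF] * VEC              # models tmp_acc0_vec (main lanes)
        acc_arr = [NEG_INF] * VEC              # models tmp_acc0_arr (tail lanes)
        for x2 in range(X2_SIZE):              # reduction loop sits inside the x1 loop
            if x1 < X1_TILED:                  # main (vectorized) part
                for lane in range(VEC):
                    acc_vec[lane] = max(acc_vec[lane], load(x1 + lane, x2))
            else:                              # scalar tail part
                for x1_tail in range(X1_TILED, X1_SIZE):
                    i = x1_tail - X1_TILED
                    acc_arr[i] = max(acc_arr[i], load(x1_tail, x2))
        # The "reduction stores" sit between the x1 and x2 loops, so they need the same guard.
        if x1 < X1_TILED:
            for lane in range(VEC):
                out[x1 + lane] = acc_vec[lane]
        else:
            for x1_tail in range(X1_TILED, X1_SIZE):
                out[x1_tail] = acc_arr[x1_tail - X1_TILED]
    return out

print(reduce_row(lambda x1, x2: float(100 * x1 + x2)))  # expect [17.0, 117.0, ..., 1617.0]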

Stack from ghstack (oldest at bottom):

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @blzheng @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang

@pytorch-bot bot commented Jun 17, 2024

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128812

Note: Links to docs will display an error until the docs builds have been completed.

❌ 11 New Failures

As of commit 36213d7 with merge base 32a3dbc:

NEW FAILURES - The following jobs have failed:

This comment was automatically generated by Dr. CI and updates every 15 minutes.

zhuhaozhe added a commit that referenced this pull request Jun 17, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: #128812
[ghstack-poisoned]
@zhuhaozhe marked this pull request as draft on July 17, 2024 07:47
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Jul 25, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 26, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 27, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: ff1dcca4bbb2cf3100f86bf622b492f73df3ad16
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 39d237a5cf04be275029125ef488469b2f430dda
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 6baf7b0426bbcc1ea0c06180b393ecb4619bb59d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 8254f219519f68724f941713938b04d9d44c53ac
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 29, 2024
ghstack-source-id: 470238141e894f1cd0ea1c798987c229020dccf4
Pull Request resolved: #128812
[ghstack-poisoned]
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 9, 2024
ghstack-source-id: ceb03c79c58a58e489f216df12556cc559db904d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 9, 2024
ghstack-source-id: af9d8dc8e5e77cfa9c203081e20cafd17569b38c
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 10, 2024
ghstack-source-id: 8cb091acab68b47a147e84d64a1d22bfa203ad02
Pull Request resolved: #128812
[ghstack-poisoned]
@jgong5 (Collaborator) left a comment:

I put some early comments on the LoopNest and LoopLevel changes. I still need more time to review the rest.

@@ -4872,7 +5049,7 @@ def lines(self):


@dataclasses.dataclass
class LoopNestWithSplit:
class LoopNest:
"""
A loop-nest like structure but with some loop level split along
the loop range into the main tiling loop and the tail. It is built
Collaborator: This needs amendment?

Collaborator Author: Updated.

else:
loop_nest.kernel = kernel

loop_nest = LoopNest(loops)
Collaborator: Do we want to set the kernel field of LoopNest here too?

parent: Optional["LoopLevel"] = None
# the next inner level of the loop, empty if it is inner-most
# contains >1 LoopLevel if the inner level of loop is split
inner: List["LoopLevel"] = dataclasses.field(default_factory=list)
# kernel assigned to this loop level, only valid when it is a leaf
kernel: Optional[CppKernel] = None
Collaborator: Do we still need it, considering each LoopNest only has a single kernel now?

Collaborator Author (@zhuhaozhe, Sep 25, 2024):

Yes. In this PR we maintain all kinds of kernels (CppKernel, CppVecKernel, CppTile2dKernel) in one CppKernelProxy, and assert that the kernel is a CppKernelProxy here: https://github.com/pytorch/pytorch/pull/128812/files#diff-5ab7b0235e2076a5fc6629ba0b109208940f5b94f5c13babc3e0f87cf4fcec82R2077

Collaborator: I mean, why can't we just use the kernel object from LoopNest? Why do we still have to keep a kernel object in the LoopLevel?
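A rough sketch of the two data structures under discussion (field names and comments are taken from the quoted diff above; everything else is simplified and hypothetical):

import dataclasses
from typing import List, Optional

@dataclasses.dataclass
class LoopLevel:
    parent: Optional["LoopLevel"] = None
    # the next inner level of the loop, empty if it is inner-most
    inner: List["LoopLevel"] = dataclasses.field(default_factory=list)
    # kernel assigned to this loop level, only valid when it is a leaf
    kernel: Optional[object] = None  # CppKernel in the real code

@dataclasses.dataclass
class LoopNest:
    loops: List[LoopLevel] = dataclasses.field(default_factory=list)
    # The question above: can this single kernel replace LoopLevel.kernel?
    kernel: Optional[object] = None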

inner_loop_clone.parent = loop
loop.inner.append(inner_loop_clone)
loop.kernel = deepcopy(self.kernel)
def split_with_tiling(self, factor):
Collaborator: "split" is an op that splits the loop into two, but this function doesn't seem to do so, right? It seems to create a vectorized loop level?

Collaborator Author: Thanks for the reminder; renamed to vectorized_with_tiling.

Collaborator: Did you commit the change? Also, would naming it tile sound better? Please also amend the code docs for the related functions.

zhuhaozhe added a commit that referenced this pull request Sep 25, 2024
ghstack-source-id: 8cb091acab68b47a147e84d64a1d22bfa203ad02
Pull Request resolved: #128812

dtype: torch.dtype,
init_fn,
):
# gen preduction prefix
Collaborator:

Suggested change
# gen preduction prefix
# gen reduction prefix


stack.enter_context(code.indent())
if loop_nest.root:
if loop_nest.loops:
Collaborator: Do we still need this check here?

kernel.gen_body(code)

def get_reduction_prefix_suffix(kernel, parallel=False, buffer="prefix"):
if buffer == "suffix":
Collaborator: Why not use a boolean flag here?

gen_loops(loop.inner, loop.is_reduction)
else:
gen_loop_kernel(loop)
gen_loop_nest(_loop_nest, depth, loop.is_reduction)
Collaborator: nit: you don't have to do depth += 1 separately.

Suggested change
gen_loop_nest(_loop_nest, depth, loop.is_reduction)
gen_loop_nest(_loop_nest, depth + 1, loop.is_reduction)

tiling_idx = FloorDiv(loop.size, sympy_factor) * sympy_factor
loop.steps = sympy_factor
loop.simd_vec = True
loop.tiling_offset = tiling_idx
Collaborator: Suggest using loop.tiled_size.
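To make the tiling arithmetic in the quoted lines concrete, here is a small standalone sympy sketch (variable names are hypothetical; the real code sets these values as fields on the LoopLevel):

import sympy

size = sympy.Integer(199985)   # loop.size, using the bound from the first example
factor = sympy.Integer(16)     # sympy_factor, i.e. the vector width

# FloorDiv(loop.size, sympy_factor) * sympy_factor
tiled_size = sympy.floor(size / factor) * factor
print(tiled_size)              # 199984: main loop covers [0, 199984), tail covers [199984, 199985)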

@@ -1714,11 +1782,14 @@ class CppKernel(Kernel):

def __init__(self, args, num_threads):
super().__init__(args)
self.active_ranges: dict[sympy.Expr, Tuple[sympy.Expr, ...]] = {}
self.inner_itervars: List[sympy.Symbol] = []
Collaborator: ditto

self.call_ranges: Optional[Tuple[sympy.Expr, ...]] = None
self.ranges: List[sympy.Expr] = []
self.itervars: List[sympy.Symbol] = []
self.reduction_depth = None
self.reduction_prefix = IndentedBuffer()
self.reduction_prefix_fn: List[Callable] = [] # type: ignore[type-arg]
Collaborator:

Suggested change
self.reduction_prefix_fn: List[Callable] = [] # type: ignore[type-arg]
self.reduction_prefix_generators: List[Callable] = [] # type: ignore[type-arg]

@@ -239,6 +238,101 @@ def reduction_project(reduction_type, acc):
return acc


def transform_kernel_codes_under_inner_loop(
Collaborator: Is move_code_under_inner_loop simpler?

reduction_vars = tail_loop_kernel.reduction_var_names
for name in reduction_vars:
new_name = f"{name}_arr[{outer_loop.var}_tail - {cexpr_index(outer_loop.tiling_offset)}]"
replace_acc_name(tail_loop_kernel.stores, name, new_name)
Collaborator: Do we still need the logic of replacing a generated buffer after introducing the design of lazy generation with callbacks?
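For context, a hedged sketch of the lazy-generation-with-callbacks idea referenced here (hypothetical helper, not the inductor implementation): the reduction prefix is not emitted eagerly; instead, generator callbacks are collected and rendered once it is known whether a tail accumulator array is needed.

from typing import Callable, List

class ReductionPrefix:
    # Collects prefix generators and renders them lazily.
    def __init__(self) -> None:
        self.generators: List[Callable[[int], str]] = []

    def add(self, gen: Callable[[int], str]) -> None:
        self.generators.append(gen)

    def render(self, tail_lanes: int) -> str:
        # Called after loop splitting, when the tail size is known.
        lines = [gen(tail_lanes) for gen in self.generators]
        return "\n".join(line for line in lines if line)

prefix = ReductionPrefix()
prefix.add(lambda lanes: "float tmp_acc0 = -std::numeric_limits<float>::infinity();")
prefix.add(lambda lanes: f"float tmp_acc0_arr[{lanes}];" if lanes else "")
print(prefix.render(8))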

zhuhaozhe added a commit that referenced this pull request Oct 18, 2024
ghstack-source-id: d888c7594d9013e594f1e317cb1d2486acb481e6
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 18, 2024
ghstack-source-id: e47cfd0541b61da2496bbfdd74ea1420035de280
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 21, 2024
ghstack-source-id: f9e59d934c7bda4fbf166e70d444772d0b6ca1b7
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 25, 2024
ghstack-source-id: f9193cb0534abf568ae32986ca7f4e7817c3bd9b
Pull Request resolved: #128812