[inductor] refine loop split logic #128812

Draft · zhuhaozhe wants to merge 28 commits into base: gh/zhuhaozhe/39/base

Conversation

@zhuhaozhe (Collaborator) commented Jun 17, 2024

This PR aims to improve parallelization by collapsing the vectorized loop. See #122281.

For the case below, the parallel level is only 2, and the vectorized loop cannot be collapsed:

#pragma omp for
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199984L); x1+=static_cast<long>(16L))
    {
        auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
        tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
    }
    #pragma omp simd simdlen(8) 
    for(long x1=static_cast<long>(199984L); x1<static_cast<long>(199985L); x1+=static_cast<long>(1L))
    {
        auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
        out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
    }
}

After this PR, we generate the following code:

#pragma omp for collapse(2)
for(long x0=static_cast<long>(0L); x0<static_cast<long>(2L); x0+=static_cast<long>(1L))
{
    for(long x1=static_cast<long>(0L); x1<static_cast<long>(199985L); x1+=static_cast<long>(16L))
    {
        if (x1 >= 0 && x1 <199984) {
            auto tmp0 = at::vec::VectorizedN<int64_t,2>::loadu(in_ptr0 + static_cast<long>(x1 + (199985L*x0)), 16);
            tmp0.store(out_ptr0 + static_cast<long>(x1 + (209985L*x0)), 16);
        }
        if (x1 >= 199984 && x1 <199985) {
            auto tmp0 = in_ptr0[static_cast<long>(x1 + (199985L*x0))];
            out_ptr0[static_cast<long>(x1 + (209985L*x0))] = tmp0;
        }
    }
}
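As a rough illustration of the gain, the parallel iteration space grows from 2 to about 25,000 once the vectorized loop is collapsed. The back-of-the-envelope sketch below is plain Python (not inductor code), using the loop bounds from the kernels above:

import math

x0_range = 2          # outer loop trip count
x1_range = 199985     # inner loop size in elements
vec_len = 16          # vector width used by the inner loop

before = x0_range                                  # only x0 is parallel -> 2
after = x0_range * math.ceil(x1_range / vec_len)   # collapse(2) -> 2 * 12500 = 25000
print(before, after)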

Highlight

For the reduction case, there is a side effect. In the case below, the x1 dim is vectorized and the reduction runs over the x2 dim.

#pragma omp for
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
            }
            [&]
            {
                __at_align__ std::array<float, 8> tmpbuf;
                tmp_acc0_vec.store(tmpbuf.data(), 8);
                #pragma GCC unroll 8
                for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                }
            }
            ()
            ;
        }
    }
    #pragma omp simd simdlen(4) 
    for(int64_t x1=static_cast<int64_t>(16L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(1L))
    {
        {
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                auto tmp0 = in_ptr1[static_cast<int64_t>(x1 + (17L*x2) + (306L*x0))];
                tmp_acc0 = max_propagate_nan(tmp_acc0, tmp0);
            }
            out_ptr1[static_cast<int64_t>(x0 + (39L*x1))] = tmp_acc0;
        }
    }
}

After collapsing, the loop order becomes x1 -> x2 -> x1_tail_part, so we need a tmp_acc_arr to hold the reduction result for the x1 tail part. For the reduction_stores, we also need to check x1's value, just as we do in the loop body, since the reduction_stores happen between the x1 and x2 loops. (A pure-Python model of this scheme follows the generated code below.)

#pragma omp for collapse(2)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(39L); x0+=static_cast<int64_t>(1L))
{
    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(17L); x1+=static_cast<int64_t>(8L))
    {
        {
            float tmp_acc0_arr[8];           // need an array to hold the acc result for the tail part
            for (int i = 0; i < 8; i++)
            {
                tmp_acc0_arr[i] = -std::numeric_limits<float>::infinity();
            }
            float tmp_acc0 = -std::numeric_limits<float>::infinity();
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
            for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(18L); x2+=static_cast<int64_t>(1L))
            {
                {
                    if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1 + (17L*x2) + (306L*x0)), 8);
                        tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
                    }
                    if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
                    {
                        for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                        {
                            auto tmp0 = in_ptr1[static_cast<int64_t>(x1_tail + (17L*x2) + (306L*x0))];
                            tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)] = max_propagate_nan(tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)], tmp0);
                        }
                    }
                }
            }

            // reduction stores
            if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L)))
            {
                [&]
                {
                    __at_align__ std::array<float, 8> tmpbuf;
                    tmp_acc0_vec.store(tmpbuf.data(), 8);
                    #pragma GCC unroll 8
                    for (long x1_inner = 0; x1_inner < 8; x1_inner++)
                    {
                        out_ptr1[static_cast<int64_t>(x0 + (39L*x1) + (39L*x1_inner))] = tmpbuf[x1_inner];
                    }
                }
                ()
                ;
            }
            if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L) && x1 < static_cast<int64_t>(17L)))
            {
                for (long x1_tail = static_cast<int64_t>(16L); x1_tail < static_cast<int64_t>(17L); x1_tail++)
                {
                    out_ptr1[static_cast<int64_t>(x0 + (39L*x1_tail))] = tmp_acc0_arr[x1_tail - static_cast<int64_t>(16L)];
                }
            }
        }
    }
}
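A minimal pure-Python model of this collapsed-reduction scheme (illustrative only; the names mirror the generated code above, but none of this is inductor code):

import math

NEG_INF = -math.inf
VEC = 8                      # vector width for x1
X1_SIZE, X1_TILED = 17, 16   # full x1 range and its tiled (main-part) size
X2_SIZE = 18                 # reduction range

def reduce_row(load):
    # load(x1, x2) -> float; returns the per-x1 maxima for one x0 slice.
    out = [NEG_INF] * X1_SIZE
    for x1 in range(0, X1_SIZE, VEC):          # collapsed x1 loop, step = VEC
        acc_vec = [NEG_INF] * VEC              # models tmp_acc0_vec (main lanes)
        acc_arr = [NEG_INF] * VEC              # models tmp_acc0_arr (tail lanes)
        for x2 in range(X2_SIZE):              # reduction loop sits inside the x1 loop
            if x1 < X1_TILED:                  # main (vectorized) part
                for lane in range(VEC):
                    acc_vec[lane] = max(acc_vec[lane], load(x1 + lane, x2))
            else:                              # scalar tail part
                for x1_tail in range(X1_TILED, X1_SIZE):
                    i = x1_tail - X1_TILED
                    acc_arr[i] = max(acc_arr[i], load(x1_tail, x2))
        # The "reduction stores" sit between the x1 and x2 loops, so they need the same guard.
        if x1 < X1_TILED:
            for lane in range(VEC):
                out[x1 + lane] = acc_vec[lane]
        else:
            for x1_tail in range(X1_TILED, X1_SIZE):
                out[x1_tail] = acc_arr[x1_tail - X1_TILED]
    return out

print(reduce_row(lambda x1, x2: float(100 * x1 + x2)))  # expect [17.0, 117.0, ..., 1617.0]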

Stack from ghstack (oldest at bottom):

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @blzheng @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @yf225 @chenyang78 @kadeng @muchulee8 @ColinPeppler @amjames @desertfire @chauhang

@pytorch-bot bot commented Jun 17, 2024

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/128812

Note: Links to docs will display an error until the docs builds have been completed.

❌ 11 New Failures

As of commit 36213d7 with merge base 32a3dbc:

NEW FAILURES - The following jobs have failed:

This comment was automatically generated by Dr. CI and updates every 15 minutes.

zhuhaozhe added a commit that referenced this pull request Jun 17, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: #128812
[ghstack-poisoned]
@zhuhaozhe marked this pull request as draft on July 17, 2024 07:47
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 24, 2024
ghstack-source-id: a0ffb42b1c0b2159b72f278aa4184ab75325cd03
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Jul 25, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 26, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit to zhuhaozhe/pytorch that referenced this pull request Jul 27, 2024
ghstack-source-id: ae8e67d681d811c0cd0ed703d186ddbe8e39f854
Pull Request resolved: pytorch#128812
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: ff1dcca4bbb2cf3100f86bf622b492f73df3ad16
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 39d237a5cf04be275029125ef488469b2f430dda
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 6baf7b0426bbcc1ea0c06180b393ecb4619bb59d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 16, 2024
ghstack-source-id: 8254f219519f68724f941713938b04d9d44c53ac
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Aug 29, 2024
ghstack-source-id: 470238141e894f1cd0ea1c798987c229020dccf4
Pull Request resolved: #128812
[ghstack-poisoned]
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 9, 2024
ghstack-source-id: ceb03c79c58a58e489f216df12556cc559db904d
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 9, 2024
ghstack-source-id: af9d8dc8e5e77cfa9c203081e20cafd17569b38c
Pull Request resolved: #128812
[ghstack-poisoned]
zhuhaozhe added a commit that referenced this pull request Sep 10, 2024
ghstack-source-id: 8cb091acab68b47a147e84d64a1d22bfa203ad02
Pull Request resolved: #128812
[ghstack-poisoned]
@jgong5 (Collaborator) left a comment:

I put some early comments on the LoopNest and LoopLevel changes. I still need more time to review the rest.

@@ -4872,7 +5049,7 @@ def lines(self):


@dataclasses.dataclass
class LoopNestWithSplit:
class LoopNest:
"""
A loop-nest like structure but with some loop level split along
the loop range into the main tiling loop and the tail. It is built
Collaborator: This needs amendment?

Collaborator Author: Updated.

else:
loop_nest.kernel = kernel

loop_nest = LoopNest(loops)
Collaborator: Do we want to set the kernel field of LoopNest here too?

parent: Optional["LoopLevel"] = None
# the next inner level of the loop, empty if it is inner-most
# contains >1 LoopLevel if the inner level of loop is split
inner: List["LoopLevel"] = dataclasses.field(default_factory=list)
# kernel assigned to this loop level, only valid when it is a leaf
kernel: Optional[CppKernel] = None
Collaborator: Do we still need it, considering each LoopNest only has a single kernel now?

Collaborator Author (@zhuhaozhe, Sep 25, 2024):

Yes. In this PR we maintain all kinds of kernels (CppKernel, CppVecKernel, CppTile2dKernel) in one CppKernelProxy, and assert that the kernel is a CppKernelProxy here: https://github.com/pytorch/pytorch/pull/128812/files#diff-5ab7b0235e2076a5fc6629ba0b109208940f5b94f5c13babc3e0f87cf4fcec82R2077

Collaborator: I mean, why can't we just use the kernel object from LoopNest? Why do we still have to keep a kernel object in the LoopLevel?
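A rough sketch of the two data structures under discussion (field names and comments are taken from the quoted diff above; everything else is simplified and hypothetical):

import dataclasses
from typing import List, Optional

@dataclasses.dataclass
class LoopLevel:
    parent: Optional["LoopLevel"] = None
    # the next inner level of the loop, empty if it is inner-most
    inner: List["LoopLevel"] = dataclasses.field(default_factory=list)
    # kernel assigned to this loop level, only valid when it is a leaf
    kernel: Optional[object] = None  # CppKernel in the real code

@dataclasses.dataclass
class LoopNest:
    loops: List[LoopLevel] = dataclasses.field(default_factory=list)
    # The question above: can this single kernel replace LoopLevel.kernel?
    kernel: Optional[object] = None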

inner_loop_clone.parent = loop
loop.inner.append(inner_loop_clone)
loop.kernel = deepcopy(self.kernel)
def split_with_tiling(self, factor):
Collaborator: "split" is an op that splits the loop into two, but this function doesn't seem to do so, right? It seems to create a vectorized loop level?

Collaborator Author: Thanks for the reminder; renamed to vectorized_with_tiling.

Collaborator: Did you commit the change? Also, would naming it tile sound better? Please also amend the code docs for the related functions.

zhuhaozhe added a commit that referenced this pull request Sep 25, 2024
ghstack-source-id: 8cb091acab68b47a147e84d64a1d22bfa203ad02
Pull Request resolved: #128812

dtype: torch.dtype,
init_fn,
):
# gen preduction prefix
Collaborator:

Suggested change
# gen preduction prefix
# gen reduction prefix


stack.enter_context(code.indent())
if loop_nest.root:
if loop_nest.loops:
Collaborator: Do we still need this check here?

kernel.gen_body(code)

def get_reduction_prefix_suffix(kernel, parallel=False, buffer="prefix"):
if buffer == "suffix":
Collaborator: Why not use a boolean flag here?

gen_loops(loop.inner, loop.is_reduction)
else:
gen_loop_kernel(loop)
gen_loop_nest(_loop_nest, depth, loop.is_reduction)
Collaborator: nit: you don't have to do depth += 1 separately.

Suggested change
gen_loop_nest(_loop_nest, depth, loop.is_reduction)
gen_loop_nest(_loop_nest, depth + 1, loop.is_reduction)

tiling_idx = FloorDiv(loop.size, sympy_factor) * sympy_factor
loop.steps = sympy_factor
loop.simd_vec = True
loop.tiling_offset = tiling_idx
Collaborator: Suggest using loop.tiled_size.
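To make the tiling arithmetic in the quoted lines concrete, here is a small standalone sympy sketch (variable names are hypothetical; the real code sets these values as fields on the LoopLevel):

import sympy

size = sympy.Integer(199985)   # loop.size, using the bound from the first example
factor = sympy.Integer(16)     # sympy_factor, i.e. the vector width

# FloorDiv(loop.size, sympy_factor) * sympy_factor
tiled_size = sympy.floor(size / factor) * factor
print(tiled_size)              # 199984: main loop covers [0, 199984), tail covers [199984, 199985)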

@@ -1714,11 +1782,14 @@ class CppKernel(Kernel):

def __init__(self, args, num_threads):
super().__init__(args)
self.active_ranges: dict[sympy.Expr, Tuple[sympy.Expr, ...]] = {}
self.inner_itervars: List[sympy.Symbol] = []
Collaborator: ditto

self.call_ranges: Optional[Tuple[sympy.Expr, ...]] = None
self.ranges: List[sympy.Expr] = []
self.itervars: List[sympy.Symbol] = []
self.reduction_depth = None
self.reduction_prefix = IndentedBuffer()
self.reduction_prefix_fn: List[Callable] = [] # type: ignore[type-arg]
Collaborator:

Suggested change
self.reduction_prefix_fn: List[Callable] = [] # type: ignore[type-arg]
self.reduction_prefix_generators: List[Callable] = [] # type: ignore[type-arg]

@@ -239,6 +238,101 @@ def reduction_project(reduction_type, acc):
return acc


def transform_kernel_codes_under_inner_loop(
Collaborator: Is move_code_under_inner_loop simpler?

reduction_vars = tail_loop_kernel.reduction_var_names
for name in reduction_vars:
new_name = f"{name}_arr[{outer_loop.var}_tail - {cexpr_index(outer_loop.tiling_offset)}]"
replace_acc_name(tail_loop_kernel.stores, name, new_name)
Collaborator: Do we still need the logic of replacing a generated buffer after introducing the design of lazy generation with callbacks?
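For context, a hedged sketch of the lazy-generation-with-callbacks idea referenced here (hypothetical helper, not the inductor implementation): the reduction prefix is not emitted eagerly; instead, generator callbacks are collected and rendered once it is known whether a tail accumulator array is needed.

from typing import Callable, List

class ReductionPrefix:
    # Collects prefix generators and renders them lazily.
    def __init__(self) -> None:
        self.generators: List[Callable[[int], str]] = []

    def add(self, gen: Callable[[int], str]) -> None:
        self.generators.append(gen)

    def render(self, tail_lanes: int) -> str:
        # Called after loop splitting, when the tail size is known.
        lines = [gen(tail_lanes) for gen in self.generators]
        return "\n".join(line for line in lines if line)

prefix = ReductionPrefix()
prefix.add(lambda lanes: "float tmp_acc0 = -std::numeric_limits<float>::infinity();")
prefix.add(lambda lanes: f"float tmp_acc0_arr[{lanes}];" if lanes else "")
print(prefix.render(8))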

zhuhaozhe added a commit that referenced this pull request Oct 18, 2024
ghstack-source-id: d888c7594d9013e594f1e317cb1d2486acb481e6
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 18, 2024
ghstack-source-id: e47cfd0541b61da2496bbfdd74ea1420035de280
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 21, 2024
ghstack-source-id: f9e59d934c7bda4fbf166e70d444772d0b6ca1b7
Pull Request resolved: #128812
@zhuhaozhe (Collaborator Author):

@pytorchbot rebase

@pytorchmergebot (Collaborator):

@pytorchbot started a rebase job onto refs/remotes/origin/viable/strict. Check the current status here

@pytorchmergebot (Collaborator):

Successfully rebased gh/zhuhaozhe/39/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/128812)

pytorchmergebot pushed a commit that referenced this pull request Oct 25, 2024
ghstack-source-id: f9193cb0534abf568ae32986ca7f4e7817c3bd9b
Pull Request resolved: #128812