119 changes: 79 additions & 40 deletions src/autoschedulers/anderson2021/LoopNest.cpp
@@ -718,7 +718,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const {

// Get the stride over "node's" storage and its element-wise stride for a unit
// increment in the given thread loops
-Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose) const {
+Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const {
internal_assert(innermost_storage_dim >= 0);

if (verbose) {
Expand Down Expand Up @@ -756,7 +756,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage
}

Strides strides{storage_strides};
-for (const auto &thread_loop_var : thread_info.loop_vars) {
+for (const auto &thread_loop_var : thread_info->loop_vars) {
int loop_index = stage->get_loop_index_from_var(thread_loop_var);
bool loop_index_exists = loop_index >= 0;

Expand Down Expand Up @@ -843,7 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
return;
}

-const ThreadInfo &thread_info = *gpu_loop_info.thread_info;
+internal_assert(gpu_loop_info.thread_info != nullptr);
+const ThreadInfo *thread_info = gpu_loop_info.thread_info;
bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;

size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
Expand Down Expand Up @@ -967,18 +968,29 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
}

template<typename T>
-void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
+void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
+int bytes_per_access = node->bytes_per_point;
+
+// If the consumer is a scalar and is compute_root, then it will not be
+// surrounded by a gpu_threads loop, in which case thread_info will be null.
+// In this case, there is no need to compute the below thread/warp-related
+// details because only a single point is being computed
+if (!thread_info && is_scalar()) {
+mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
+return;
+}
+
+internal_assert(thread_info != nullptr);

Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose);

-size_t dimensions = thread_info.loop_indices.size();
+size_t dimensions = thread_info->loop_indices.size();
strides.dump(verbose);

-int bytes_per_access = node->bytes_per_point;
-
{
-int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
+int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
-thread_info.for_each_thread_id_in_first_warp(accumulator);
+thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
@@ -987,21 +999,21 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

-if (!thread_info.has_tail_warp) {
+if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
-thread_info.for_each_thread_id_in_tail_warp(accumulator);
+thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
@@ -1013,18 +1025,27 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
}
}
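For context: the regular/tail warp split costed above depends only on how the block's active thread count divides into warps. Below is a minimal standalone sketch of that bookkeeping, using a hypothetical `WarpPartition` type whose fields mirror the `ThreadInfo` members used in this function (an assumption for illustration; the real struct also tracks loop indices, loop vars, and thread IDs, omitted here):

```cpp
#include <iostream>

// Hypothetical, simplified stand-in for ThreadInfo's warp bookkeeping.
struct WarpPartition {
    static constexpr int warp_size = 32;
    int num_threads;                         // active threads per block
    int num_regular_active_warps_per_block;  // fully populated warps
    bool has_tail_warp;                      // partially filled final warp?
    int num_threads_in_final_warp;           // lanes in that tail warp

    explicit WarpPartition(int threads)
        : num_threads(threads),
          num_regular_active_warps_per_block(threads / warp_size),
          has_tail_warp(threads % warp_size != 0),
          num_threads_in_final_warp(threads % warp_size) {}
};

int main() {
    // 100 active threads -> 3 regular warps plus a 4-thread tail warp.
    WarpPartition p(100);
    std::cout << p.num_regular_active_warps_per_block << " regular warps, tail of "
              << p.num_threads_in_final_warp << " threads\n";
}
```

The tail warp gets its own accumulator pass in the code above because its per-request memory footprint differs from that of a full warp.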

-template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;
+template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;

-template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;
+template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;

template<>
-void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
+void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
int bytes_per_access = node->bytes_per_point;

+// If the consumer is a scalar and is compute_root, then it will not be
+// surrounded by a gpu_threads loop, in which case thread_info will be null.
+// In this case, there is no need to compute the below thread/warp-related
+// details because only a single point is being computed
+if (!thread_info && is_scalar()) {
+mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
+return;
+}

{
-int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
+int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
LocalAccessAccumulator accumulator(bytes_per_access, verbose);
-thread_info.for_each_thread_id_in_first_warp(accumulator);
+thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
@@ -1033,21 +1054,21 @@ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

-if (!thread_info.has_tail_warp) {
+if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

LocalAccessAccumulator accumulator(bytes_per_access, verbose);
-thread_info.for_each_thread_id_in_tail_warp(accumulator);
+thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
@@ -1074,19 +1095,19 @@ std::pair<double, double> LoopNest::compute_local_mem_store_features(const LoadJ
}

template<typename T>
-MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const {
+MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const {
MemInfoType<T> mem_info;

compute_num_mem_accesses_per_block<T>(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose);
return mem_info;
}
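For context: every signature change in this file follows one design decision. `thread_info` may now legitimately be absent, because a scalar compute_root consumer is never surrounded by a gpu_threads loop, and a C++ reference cannot express absence, so the parameter becomes a pointer. A minimal sketch of the pattern with hypothetical names (`ThreadInfoLike` and `cost_accesses` are inventions for illustration, not the LoopNest API):

```cpp
#include <cassert>
#include <cstdio>

struct ThreadInfoLike { int num_threads = 256; };  // hypothetical stand-in

// A reference parameter cannot be "absent", so the parameter becomes a
// pointer and the one legal null case is handled before any dereference.
void cost_accesses(const ThreadInfoLike *thread_info, bool is_scalar) {
    if (!thread_info && is_scalar) {
        std::puts("scalar compute_root: one point computed, skip warp math");
        return;
    }
    assert(thread_info != nullptr);  // every other case must supply it
    std::printf("costing %d threads per block\n", thread_info->num_threads);
}

int main() {
    ThreadInfoLike info;
    cost_accesses(&info, /*is_scalar=*/false);   // ordinary GPU stage
    cost_accesses(nullptr, /*is_scalar=*/true);  // scalar compute_root stage
}
```

Keeping the null check plus the assert in the callee, as the diff does, concentrates the scalar special case in one place instead of at every call site.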

-template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
+template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

-template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
+template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

template<typename T>
-void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
+void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
if (producer_has_been_scheduled) {
compute_num_mem_accesses_per_block<T>(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose);

Expand Down Expand Up @@ -1115,7 +1136,7 @@ template void LoopNest::compute_mem_load_features<GlobalMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<GlobalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
@@ -1125,7 +1146,7 @@ template void LoopNest::compute_mem_load_features<SharedMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<SharedMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
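For reference: the "scalar and compute_root" case that the new guards handle arises from zero-dimensional Funcs. A hedged sketch of a pipeline with that shape (illustrative only; this is not a test from the PR, the schedule would normally be generated by the autoscheduler, and realize() here runs on the host since no GPU schedule is given):

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    Func input("input"), total("total"), out("out");
    Var x("x");

    input(x) = cast<float>(x);

    // total is zero-dimensional: it has no pure loop variables, so no
    // gpu_blocks/gpu_threads loops can ever surround its computation.
    RDom r(0, 128);
    total() = 0.0f;
    total() += input(r);

    out(x) = input(x) / total();

    // With total marked compute_root, its single point is computed outside
    // any GPU thread hierarchy, so the cost model would see this consumer
    // with thread_info == nullptr.
    total.compute_root();

    out.realize({128});  // host execution; shown only to make this runnable
    return 0;
}
```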
@@ -1136,7 +1157,7 @@ void LoopNest::compute_mem_load_features<LocalMem>(const LoadJacobian &jac,
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<LocalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const {
@@ -2163,11 +2184,13 @@ void LoopNest::compute_features(const FunctionDAG &dag,
// The store_at location of the consumer
const auto *consumer_store_site = innermost ? parent : consumer_site.store;

+bool inner_serial_loop_extents_computed = false;
std::vector<int64_t> inner_serial_loop_extents;

if (innermost && !stage->store_jacobian->empty()) {
const auto &bounds = consumer_site.store->get_bounds(stage->node);
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
+inner_serial_loop_extents_computed = true;
auto store_jac = *stage->store_jacobian;

compute_gpu_store_features(
@@ -2223,10 +2246,16 @@ void LoopNest::compute_features(const FunctionDAG &dag,
for (const auto &j : e->load_jacobians) {
jacobians.emplace_back(j, e->producer);

+if (!inner_serial_loop_extents_computed && !is_scalar()) {
+inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
+inner_serial_loop_extents_computed = true;
+}

// Thread loops may not be innermost so in the
// Jacobians we need to account for the stride
-// of the inner loops
-thread_jacobians.emplace_back(j * inner_serial_loop_extents, e->producer);
+// of the inner loops (but only for non-scalars,
+// since scalars never have inner serial loops)
+thread_jacobians.emplace_back(is_scalar() ? j : j * inner_serial_loop_extents, e->producer);
}
} else {
// Consumer was inlined. Multiply the Jacobians to look through it.
@@ -2334,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
shared_mem_loads,
points_accessed,
verbose);
@@ -2365,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
global_mem_loads,
points_accessed,
verbose);
@@ -2405,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
local_mem_loads,
points_accessed,
verbose);
@@ -2678,10 +2707,20 @@ void LoopNest::compute_features(const FunctionDAG &dag,
inlined_feat.outer_parallelism = parallelism;
inlined_feat.num_blocks = parallelism;

-internal_assert(gpu_loop_info.thread_info);
-auto num_warps = it.value() * gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
-inlined_feat.num_warps_per_block += num_warps;
-inlined_feat.num_threads_per_block += gpu_loop_info.thread_info->num_threads;
+internal_assert(is_scalar() || gpu_loop_info.thread_info);
+
+auto num_warps_per_block = it.value();
+auto num_threads_per_block = 1;
+
+// If the func is being inlined into a scalar, then the scalar will not
+// be surrounded by block/thread/serial loops so there's no need to take
+// them into account when computing these features
+if (!is_scalar()) {
+num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
+num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+}
+inlined_feat.num_warps_per_block += num_warps_per_block;
+inlined_feat.num_threads_per_block += num_threads_per_block;
double points_computed_per_thread = it.value() * feat.points_computed_per_thread;
inlined_feat.points_computed_per_thread += points_computed_per_thread;

@@ -2695,9 +2734,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,

intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
intermediate.outer_parallelism = parallelism;
-intermediate.num_warps_per_block = num_warps;
+intermediate.num_warps_per_block = num_warps_per_block;

-intermediate.num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+intermediate.num_threads_per_block = num_threads_per_block;
intermediate.points_computed_per_thread = points_computed_per_thread;
}
}
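For context: the scalar/non-scalar branch in the last hunk can be traced by hand. A small sketch with made-up numbers (4 inlined calls, total serial extent 2, 8 warps per block, 16 blocks, 256 threads per block; none of these values come from the PR):

```cpp
#include <cstdio>

int main() {
    // Invented stand-ins for it.value(), gpu_loop_info.total_serial_extents(),
    // thread_info->num_warps_per_block, inlined_feat.num_blocks, and
    // thread_info->num_threads.
    double calls = 4, serial_extents = 2, warps_per_block = 8, blocks = 16;
    int threads = 256;

    // Non-scalar consumer: scale the call count by the full GPU hierarchy.
    double num_warps_per_block = calls * serial_extents * warps_per_block * blocks;
    int num_threads_per_block = threads;
    std::printf("non-scalar: %.0f warps, %d threads\n",
                num_warps_per_block, num_threads_per_block);  // 1024 warps, 256 threads

    // Scalar consumer: no block/thread/serial loops exist, so the feature
    // reduces to the call count with a single implicit thread.
    num_warps_per_block = calls;
    num_threads_per_block = 1;
    std::printf("scalar:     %.0f warps, %d threads\n",
                num_warps_per_block, num_threads_per_block);  // 4 warps, 1 thread
}
```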