119 changes: 79 additions & 40 deletions src/autoschedulers/anderson2021/LoopNest.cpp
@@ -718,7 +718,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const {

// Get the stride over "node's" storage and its element-wise stride for a unit
// increment in the given thread loops
-Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose) const {
+Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const {
internal_assert(innermost_storage_dim >= 0);

if (verbose) {
Expand Down Expand Up @@ -756,7 +756,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage
}

Strides strides{storage_strides};
-for (const auto &thread_loop_var : thread_info.loop_vars) {
+for (const auto &thread_loop_var : thread_info->loop_vars) {
int loop_index = stage->get_loop_index_from_var(thread_loop_var);
bool loop_index_exists = loop_index >= 0;

Expand Down Expand Up @@ -843,7 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
return;
}

-const ThreadInfo &thread_info = *gpu_loop_info.thread_info;
+internal_assert(gpu_loop_info.thread_info != nullptr);
+const ThreadInfo *thread_info = gpu_loop_info.thread_info;
bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;

size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
Expand Down Expand Up @@ -967,18 +968,29 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
}

template<typename T>
-void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
+void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
+int bytes_per_access = node->bytes_per_point;
+
+// If the consumer is a scalar and is compute_root, then it will not be
+// surrounded by a gpu_threads loop, in which case thread_info will be null.
+// In this case, there is no need to compute the below thread/warp-related
+// details because only a single point is being computed
+if (!thread_info && is_scalar()) {
+mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
+return;
+}
+
+internal_assert(thread_info != nullptr);

Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose);

-size_t dimensions = thread_info.loop_indices.size();
+size_t dimensions = thread_info->loop_indices.size();
strides.dump(verbose);

-int bytes_per_access = node->bytes_per_point;
-
{
-int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
+int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
-thread_info.for_each_thread_id_in_first_warp(accumulator);
+thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
@@ -987,21 +999,21 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

-if (!thread_info.has_tail_warp) {
+if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
-thread_info.for_each_thread_id_in_tail_warp(accumulator);
+thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
@@ -1013,18 +1025,27 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
}
}
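For context: the regular/tail warp split costed above depends only on how the block's active thread count divides into warps. Below is a minimal standalone sketch of that bookkeeping, using a hypothetical `WarpPartition` type whose fields mirror the `ThreadInfo` members used in this function (an assumption for illustration; the real struct also tracks loop indices, loop vars, and thread IDs, omitted here):

```cpp
#include <iostream>

// Hypothetical, simplified stand-in for ThreadInfo's warp bookkeeping.
struct WarpPartition {
    static constexpr int warp_size = 32;
    int num_threads;                         // active threads per block
    int num_regular_active_warps_per_block;  // fully populated warps
    bool has_tail_warp;                      // partially filled final warp?
    int num_threads_in_final_warp;           // lanes in that tail warp

    explicit WarpPartition(int threads)
        : num_threads(threads),
          num_regular_active_warps_per_block(threads / warp_size),
          has_tail_warp(threads % warp_size != 0),
          num_threads_in_final_warp(threads % warp_size) {}
};

int main() {
    // 100 active threads -> 3 regular warps plus a 4-thread tail warp.
    WarpPartition p(100);
    std::cout << p.num_regular_active_warps_per_block << " regular warps, tail of "
              << p.num_threads_in_final_warp << " threads\n";
}
```

The tail warp gets its own accumulator pass in the code above because its per-request memory footprint differs from that of a full warp.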

-template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;
+template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;

-template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;
+template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;

template<>
-void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
+void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
int bytes_per_access = node->bytes_per_point;

+// If the consumer is a scalar and is compute_root, then it will not be
+// surrounded by a gpu_threads loop, in which case thread_info will be null.
+// In this case, there is no need to compute the below thread/warp-related
+// details because only a single point is being computed
+if (!thread_info && is_scalar()) {
+mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
+return;
+}

{
-int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
+int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
LocalAccessAccumulator accumulator(bytes_per_access, verbose);
-thread_info.for_each_thread_id_in_first_warp(accumulator);
+thread_info->for_each_thread_id_in_first_warp(accumulator);

accumulator.add_access_info(
num_requests,
@@ -1033,21 +1054,21 @@ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &

if (verbose) {
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
}
}

-if (!thread_info.has_tail_warp) {
+if (!thread_info->has_tail_warp) {
return;
}

if (verbose) {
aslog(2) << "\nBEGIN tail warp\n";
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
}

LocalAccessAccumulator accumulator(bytes_per_access, verbose);
-thread_info.for_each_thread_id_in_tail_warp(accumulator);
+thread_info->for_each_thread_id_in_tail_warp(accumulator);

accumulator.add_access_info(
num_requests_per_warp,
@@ -1074,19 +1095,19 @@ std::pair<double, double> LoopNest::compute_local_mem_store_features(const LoadJ
}

template<typename T>
-MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const {
+MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const {
MemInfoType<T> mem_info;

compute_num_mem_accesses_per_block<T>(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose);
return mem_info;
}
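For context: every signature change in this file follows one design decision. `thread_info` may now legitimately be absent, because a scalar compute_root consumer is never surrounded by a gpu_threads loop, and a C++ reference cannot express absence, so the parameter becomes a pointer. A minimal sketch of the pattern with hypothetical names (`ThreadInfoLike` and `cost_accesses` are inventions for illustration, not the LoopNest API):

```cpp
#include <cassert>
#include <cstdio>

struct ThreadInfoLike { int num_threads = 256; };  // hypothetical stand-in

// A reference parameter cannot be "absent", so the parameter becomes a
// pointer and the one legal null case is handled before any dereference.
void cost_accesses(const ThreadInfoLike *thread_info, bool is_scalar) {
    if (!thread_info && is_scalar) {
        std::puts("scalar compute_root: one point computed, skip warp math");
        return;
    }
    assert(thread_info != nullptr);  // every other case must supply it
    std::printf("costing %d threads per block\n", thread_info->num_threads);
}

int main() {
    ThreadInfoLike info;
    cost_accesses(&info, /*is_scalar=*/false);   // ordinary GPU stage
    cost_accesses(nullptr, /*is_scalar=*/true);  // scalar compute_root stage
}
```

Keeping the null check plus the assert in the callee, as the diff does, concentrates the scalar special case in one place instead of at every call site.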

-template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
+template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

-template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
+template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;

template<typename T>
-void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
+void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
if (producer_has_been_scheduled) {
compute_num_mem_accesses_per_block<T>(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose);

Expand Down Expand Up @@ -1115,7 +1136,7 @@ template void LoopNest::compute_mem_load_features<GlobalMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<GlobalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
@@ -1125,7 +1146,7 @@ template void LoopNest::compute_mem_load_features<SharedMem>(const LoadJacobian
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<SharedMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const;
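For reference: the "scalar and compute_root" case that the new guards handle arises from zero-dimensional Funcs. A hedged sketch of a pipeline with that shape (illustrative only; this is not a test from the PR, the schedule would normally be generated by the autoscheduler, and realize() here runs on the host since no GPU schedule is given):

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    Func input("input"), total("total"), out("out");
    Var x("x");

    input(x) = cast<float>(x);

    // total is zero-dimensional: it has no pure loop variables, so no
    // gpu_blocks/gpu_threads loops can ever surround its computation.
    RDom r(0, 128);
    total() = 0.0f;
    total() += input(r);

    out(x) = input(x) / total();

    // With total marked compute_root, its single point is computed outside
    // any GPU thread hierarchy, so the cost model would see this consumer
    // with thread_info == nullptr.
    total.compute_root();

    out.realize({128});  // host execution; shown only to make this runnable
    return 0;
}
```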
@@ -1136,7 +1157,7 @@ void LoopNest::compute_mem_load_features<LocalMem>(const LoadJacobian &jac,
const FunctionDAG::Node *node,
const Bound &producer_store_bounds,
bool producer_has_been_scheduled,
-const ThreadInfo &thread_info,
+const ThreadInfo *thread_info,
MemInfoType<LocalMem> &mem_info,
double points_accessed_per_thread,
bool verbose) const {
@@ -2163,11 +2184,13 @@ void LoopNest::compute_features(const FunctionDAG &dag,
// The store_at location of the consumer
const auto *consumer_store_site = innermost ? parent : consumer_site.store;

+bool inner_serial_loop_extents_computed = false;
std::vector<int64_t> inner_serial_loop_extents;

if (innermost && !stage->store_jacobian->empty()) {
const auto &bounds = consumer_site.store->get_bounds(stage->node);
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
+inner_serial_loop_extents_computed = true;
auto store_jac = *stage->store_jacobian;

compute_gpu_store_features(
@@ -2223,10 +2246,16 @@ void LoopNest::compute_features(const FunctionDAG &dag,
for (const auto &j : e->load_jacobians) {
jacobians.emplace_back(j, e->producer);

+if (!inner_serial_loop_extents_computed && !is_scalar()) {
+inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
+inner_serial_loop_extents_computed = true;
+}

// Thread loops may not be innermost so in the
// Jacobians we need to account for the stride
-// of the inner loops
-thread_jacobians.emplace_back(j * inner_serial_loop_extents, e->producer);
+// of the inner loops (but only for non-scalars,
+// since scalars never have inner serial loops)
+thread_jacobians.emplace_back(is_scalar() ? j : j * inner_serial_loop_extents, e->producer);
}
} else {
// Consumer was inlined. Multiply the Jacobians to look through it.
@@ -2334,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
shared_mem_loads,
points_accessed,
verbose);
@@ -2365,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
global_mem_loads,
points_accessed,
verbose);
@@ -2405,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
e->producer,
producer_store_bounds,
producer_has_been_scheduled,
-*gpu_loop_info.thread_info,
+gpu_loop_info.thread_info,
local_mem_loads,
points_accessed,
verbose);
@@ -2678,10 +2707,20 @@ void LoopNest::compute_features(const FunctionDAG &dag,
inlined_feat.outer_parallelism = parallelism;
inlined_feat.num_blocks = parallelism;

-internal_assert(gpu_loop_info.thread_info);
-auto num_warps = it.value() * gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
-inlined_feat.num_warps_per_block += num_warps;
-inlined_feat.num_threads_per_block += gpu_loop_info.thread_info->num_threads;
+internal_assert(is_scalar() || gpu_loop_info.thread_info);
+
+auto num_warps_per_block = it.value();
+auto num_threads_per_block = 1;
+
+// If the func is being inlined into a scalar, then the scalar will not
+// be surrounded by block/thread/serial loops so there's no need to take
+// them into account when computing these features
+if (!is_scalar()) {
+num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
+num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+}
+inlined_feat.num_warps_per_block += num_warps_per_block;
+inlined_feat.num_threads_per_block += num_threads_per_block;
double points_computed_per_thread = it.value() * feat.points_computed_per_thread;
inlined_feat.points_computed_per_thread += points_computed_per_thread;

@@ -2695,9 +2734,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,

intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
intermediate.outer_parallelism = parallelism;
-intermediate.num_warps_per_block = num_warps;
+intermediate.num_warps_per_block = num_warps_per_block;

-intermediate.num_threads_per_block = gpu_loop_info.thread_info->num_threads;
+intermediate.num_threads_per_block = num_threads_per_block;
intermediate.points_computed_per_thread = points_computed_per_thread;
}
}
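For context: the scalar/non-scalar branch in the last hunk can be traced by hand. A small sketch with made-up numbers (4 inlined calls, total serial extent 2, 8 warps per block, 16 blocks, 256 threads per block; none of these values come from the PR):

```cpp
#include <cstdio>

int main() {
    // Invented stand-ins for it.value(), gpu_loop_info.total_serial_extents(),
    // thread_info->num_warps_per_block, inlined_feat.num_blocks, and
    // thread_info->num_threads.
    double calls = 4, serial_extents = 2, warps_per_block = 8, blocks = 16;
    int threads = 256;

    // Non-scalar consumer: scale the call count by the full GPU hierarchy.
    double num_warps_per_block = calls * serial_extents * warps_per_block * blocks;
    int num_threads_per_block = threads;
    std::printf("non-scalar: %.0f warps, %d threads\n",
                num_warps_per_block, num_threads_per_block);  // 1024 warps, 256 threads

    // Scalar consumer: no block/thread/serial loops exist, so the feature
    // reduces to the call count with a single implicit thread.
    num_warps_per_block = calls;
    num_threads_per_block = 1;
    std::printf("scalar:     %.0f warps, %d threads\n",
                num_warps_per_block, num_threads_per_block);  // 4 warps, 1 thread
}
```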