halide · alexreinking · Feb 2, 2021 · Jan 19, 2021 · Jan 28, 2021
diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp
@@ -539,6 +539,15 @@ class LowerWarpShuffles : public IRMutator {
         Expr base_val = Load::make(type, name, idx, Buffer<>(),
                                    Parameter(), const_true(idx.type().lanes()), ModulusRemainder());
 
+        Expr scalar_lane = lane;
+        if (const Broadcast *b = scalar_lane.as<Broadcast>()) {
+            scalar_lane = b->value;
+        }
+        if (equal(scalar_lane, this_lane)) {
+            // This is a regular load. No shuffling required.
+            return base_val;
+        }
+
         // Make 32-bit with a combination of reinterprets and zero extension
         Type shuffle_type = type;
         if (type.bits() < 32) {
@@ -551,15 +560,6 @@ class LowerWarpShuffles : public IRMutator {
             user_assert(type.bits() == 32) << "Warp shuffles not supported for this type: " << type << "\n";
         }
 
-        Expr scalar_lane = lane;
-        if (const Broadcast *b = scalar_lane.as<Broadcast>()) {
-            scalar_lane = b->value;
-        }
-        if (equal(scalar_lane, this_lane)) {
-            // This is a regular load. No shuffling required.
-            return base_val;
-        }
-
         internal_assert(may_use_warp_shuffle) << name << ", " << idx << ", " << lane << "\n";
 
         string intrin_suffix;
@@ -712,7 +712,7 @@ class HoistWarpShufflesFromSingleIfStmt : public IRMutator {
             body = rewrap(body);
             success = false;
         } else {
-            debug(0) << "Successfully hoisted shuffle out of for loop\n";
+            debug(3) << "Successfully hoisted shuffle out of for loop\n";
         }
         return For::make(op->name, op->min, op->extent, op->for_type, op->device_api, body);
     }

diff --git a/test/correctness/register_shuffle.cpp b/test/correctness/register_shuffle.cpp
@@ -5,9 +5,11 @@ using namespace Halide;
 int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
 
-    if (!t.features_any_of({Target::CUDACapability50,
-                            Target::CUDACapability61})) {
-        printf("[SKIP] CUDA with capability 5.0 or greater required\n");
+    int cap = t.get_cuda_capability_lower_bound();
+    if (cap < 50 || cap >= 80) {
+        printf("[SKIP] CUDA with capability between 5.0 and 7.5 required\n");
+        // TODO: Use the shfl.sync intrinsics for CUDA compute capability 8.0 and above.
+        // See issue #5630.
         return 0;
     }
 
@@ -16,26 +18,28 @@ int main(int argc, char **argv) {
         Func f, g;
         Var x, y;
 
-        f(x, y) = x + y;
+        f(x, y) = cast<uint8_t>(x + y);
         g(x, y) = f(x - 1, y) + f(x + 1, y);
 
         Var xo, xi, yi, yo;
         g
             .gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp)
             .gpu_lanes(xi);
+
         f.compute_root();
+
         f
             .in(g)
             .compute_at(g, yi)
             .split(x, xo, xi, 32, TailStrategy::RoundUp)
             .gpu_lanes(xi)
             .unroll(xo);
 
-        Buffer<int> out = g.realize(32, 4);
+        Buffer<uint8_t> out = g.realize(32, 4);
         for (int y = 0; y < out.height(); y++) {
             for (int x = 0; x < out.width(); x++) {
-                int correct = 2 * (x + y);
-                int actual = out(x, y);
+                uint8_t correct = 2 * (x + y);
+                uint8_t actual = out(x, y);
                 if (correct != actual) {
                     printf("out(%d, %d) = %d instead of %d\n",
                            x, y, actual, correct);