halide · rootjalex · Apr 12, 2022 · Apr 5, 2022 · Apr 5, 2022 · Apr 5, 2022
diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -163,6 +163,10 @@ const x86Intrinsic intrinsic_defs[] = {
     {"packuswbx32", UInt(8, 32), "saturating_narrow", {Int(16, 32)}, Target::AVX2},
     {"packuswbx16", UInt(8, 16), "saturating_narrow", {Int(16, 16)}},
 
+    // Widening multiplies that use (v)pmaddwd
+    {"wmul_pmaddwd_avx2", Int(32, 8), "widening_mul", {Int(16, 8), Int(16, 8)}, Target::AVX2},
+    {"wmul_pmaddwd_sse2", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}},
+
     // Multiply keep high half
     {"llvm.x86.avx2.pmulh.w", Int(16, 16), "pmulh", {Int(16, 16), Int(16, 16)}, Target::AVX2},
     {"llvm.x86.avx2.pmulhu.w", UInt(16, 16), "pmulh", {UInt(16, 16), UInt(16, 16)}, Target::AVX2},

diff --git a/src/runtime/x86.ll b/src/runtime/x86.ll
@@ -51,6 +51,28 @@ define weak_odr <8 x i16>  @packssdwx8(<8 x i32> %arg) nounwind alwaysinline {
   ret <8 x i16> %3
 }
 
+define weak_odr <8 x i32> @wmul_pmaddwd_avx2(<8 x i16> %a, <8 x i16> %b) nounwind alwaysinline {
+  %1 = zext <8 x i16> %a to <8 x i32>
+  %2 = zext <8 x i16> %b to <8 x i32>
+  %3 = bitcast <8 x i32> %1 to <16 x i16>
+  %4 = bitcast <8 x i32> %2 to <16 x i16>
+  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %3, <16 x i16> %4)
+  ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
+
+define weak_odr <4 x i32> @wmul_pmaddwd_sse2(<4 x i16> %a, <4 x i16> %b) nounwind alwaysinline {
+  %1 = zext <4 x i16> %a to <4 x i32>
+  %2 = zext <4 x i16> %b to <4 x i32>
+  %3 = bitcast <4 x i32> %1 to <8 x i16>
+  %4 = bitcast <4 x i32> %2 to <8 x i16>
+  %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %3, <8 x i16> %4)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
 define weak_odr <4 x float> @sqrt_f32x4(<4 x float> %x) nounwind uwtable readnone alwaysinline {
   %1 = tail call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %x) nounwind
   ret <4 x float> %1

diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
@@ -318,6 +318,9 @@ class SimdOpCheck : public SimdOpCheckTest {
             // And also for dot-products
             RDom r4(0, 4);
             check(check_pmaddwd, 2 * w, sum(i32(in_i16(x * 4 + r4)) * in_i16(x * 4 + r4 + 32)));
+
+            // Also generate for widening_mul
+            check(check_pmaddwd, 2 * w, i32(i16_1) * i32(i16_2));
         }
 
         // llvm doesn't distinguish between signed and unsigned multiplies