xiph · barrbrain · Oct 19, 2023 · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/src/asm/aarch64/transform/inverse.rs b/src/asm/aarch64/transform/inverse.rs
@@ -16,7 +16,25 @@ use crate::{Pixel, PixelType};
 use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
-pub use crate::transform::inverse::rust::inverse_transform_add_lossless;
+#[inline] // Dispatcher: use a CPU-specific routine from the lookup tables when one exists, else the Rust path.
+pub fn inverse_transform_add_lossless<T: Pixel>(
+  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
+  bd: usize, cpu: CpuFeatureLevel, // `cpu` indexes the tables; `eob` and `bd` are forwarded unchanged.
+) {
+  match T::type_enum() { // The table consulted depends on the pixel type.
+    PixelType::U8 => {
+      if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] { // A `None` entry falls through to the Rust path.
+        return call_inverse_func(func, input, output, eob, 4, 4, bd);
+      }
+    }
+    PixelType::U16 => {
+      if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] { // Same lookup, using the `_HBD_` table.
+        return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd); // 4, 4 are the width and height arguments.
+      }
+    }
+  }
+  rust::inverse_transform_add_lossless(input, output, eob, bd, cpu); // Reached only when no table entry was found.
+}
 
 pub fn inverse_transform_add<T: Pixel>(
   input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
@@ -59,6 +77,32 @@ pub fn inverse_transform_add<T: Pixel>(
   rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
 }
 
+extern { // Declarations only: the `_neon` symbols are defined outside this file.
+  fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon( // Parameter list matches the `InvTxfmFunc` pointer type.
+    dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+  );
+  fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon( // Parameter list matches the `InvTxfmHBDFunc` pointer type.
+    dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+    bitdepth_max: i32, // `call_inverse_hbd_func` passes `(1 << bd) - 1` in this position.
+  );
+}
+const INV_TXFM_WHT_FN_NEON: Option<InvTxfmFunc> = // Wraps the 8bpc symbol; `as _` casts it to the pointer type.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon as _);
+const INV_TXFM_WHT_HBD_FN_NEON: Option<InvTxfmHBDFunc> = // Wraps the 16bpc symbol in the same way.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon as _);
+
+cpu_function_lookup_table!( // Defines the table that `inverse_transform_add_lossless` indexes with `cpu.as_index()`.
+  INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
+  default: None, // NOTE(review): presumably the entry for CPU levels not listed below - confirm in the macro.
+  [NEON] // NOTE(review): presumably resolved to `INV_TXFM_WHT_FN_NEON` by name - confirm in the macro.
+);
+
+cpu_function_lookup_table!( // Same construction for the `InvTxfmHBDFunc` table.
+  INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
+  default: None, // NOTE(review): presumably the entry for CPU levels not listed below - confirm in the macro.
+  [NEON] // NOTE(review): presumably resolved to `INV_TXFM_WHT_HBD_FN_NEON` by name - confirm in the macro.
+);
+
 macro_rules! decl_itx_fns {
   // Takes a 2d list of tx types for W and H
   ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
@@ -102,7 +146,7 @@ macro_rules! decl_itx_hbd_fns {
             // Note: type1 and type2 are flipped
             fn [<rav1e_inv_txfm_add_ $TYPE2 _$TYPE1 _$W x $H _16bpc_$OPT_LOWER>](
               dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16,
-              eob: i32,
+              eob: i32, bitdepth_max: i32,
             );
           }
         )*

diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
@@ -15,7 +15,7 @@ pub type InvTxfmFunc =
   unsafe extern fn(*mut u8, libc::ptrdiff_t, *mut i16, i32);
 
 pub type InvTxfmHBDFunc =
-  unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32);
+  unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32, i32);
 
 pub fn call_inverse_func<T: Pixel>(
   func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
@@ -51,7 +51,7 @@ pub fn call_inverse_func<T: Pixel>(
 pub fn call_inverse_hbd_func<T: Pixel>(
   func: InvTxfmHBDFunc, input: &[T::Coeff],
   output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
-  _bd: usize,
+  bd: usize,
 ) {
   // Only use at most 32 columns and 32 rows of input coefficients.
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
@@ -74,6 +74,7 @@ pub fn call_inverse_hbd_func<T: Pixel>(
       T::to_asm_stride(output.plane_cfg.stride),
       copied.data.as_mut_ptr() as *mut _,
       eob as i32 - 1,
+      (1 << bd) - 1,
     );
   }
 }

diff --git a/src/asm/x86/transform/inverse.rs b/src/asm/x86/transform/inverse.rs
@@ -16,7 +16,25 @@ use crate::{Pixel, PixelType};
 use crate::asm::shared::transform::inverse::*;
 use crate::asm::shared::transform::*;
 
-pub use crate::transform::inverse::rust::inverse_transform_add_lossless;
+#[inline] // Dispatcher: use a CPU-specific routine from the lookup tables when one exists, else the Rust path.
+pub fn inverse_transform_add_lossless<T: Pixel>(
+  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
+  bd: usize, cpu: CpuFeatureLevel, // `cpu` indexes the tables; `eob` and `bd` are forwarded unchanged.
+) {
+  match T::type_enum() { // The table consulted depends on the pixel type.
+    PixelType::U8 => {
+      if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] { // A `None` entry falls through to the Rust path.
+        return call_inverse_func(func, input, output, eob, 4, 4, bd);
+      }
+    }
+    PixelType::U16 => {
+      if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] { // Same lookup, using the `_HBD_` table.
+        return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd); // 4, 4 are the width and height arguments.
+      }
+    }
+  }
+  rust::inverse_transform_add_lossless(input, output, eob, bd, cpu); // Reached only when no table entry was found.
+}
 
 pub fn inverse_transform_add<T: Pixel>(
   input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
@@ -73,6 +91,44 @@ pub fn inverse_transform_add<T: Pixel>(
   rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
 }
 
+extern { // Declarations only: the `_avx2` and `_sse2` symbols are defined outside this file.
+  fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2( // Parameter list matches the `InvTxfmFunc` pointer type.
+    dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+  );
+  fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2( // Same parameter list as the `_avx2` 8bpc declaration.
+    dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+  );
+  fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2( // Parameter list matches the `InvTxfmHBDFunc` pointer type.
+    dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+    bitdepth_max: i32, // `call_inverse_hbd_func` passes `(1 << bd) - 1` in this position.
+  );
+  fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2( // Same parameter list as the `_avx2` 16bpc declaration.
+    dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
+    bitdepth_max: i32, // `call_inverse_hbd_func` passes `(1 << bd) - 1` in this position.
+  );
+}
+
+const INV_TXFM_WHT_FN_AVX2: Option<InvTxfmFunc> = // Wraps the 8bpc AVX2 symbol; `as _` casts it to the pointer type.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2 as _);
+const INV_TXFM_WHT_FN_SSE2: Option<InvTxfmFunc> = // Wraps the 8bpc SSE2 symbol.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2 as _);
+const INV_TXFM_WHT_HBD_FN_AVX2: Option<InvTxfmHBDFunc> = // Wraps the 16bpc AVX2 symbol.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2 as _);
+const INV_TXFM_WHT_HBD_FN_SSE2: Option<InvTxfmHBDFunc> = // Wraps the 16bpc SSE2 symbol.
+  Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2 as _);
+
+cpu_function_lookup_table!( // Defines the table that `inverse_transform_add_lossless` indexes with `cpu.as_index()`.
+  INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
+  default: None, // NOTE(review): presumably the entry for CPU levels not listed below - confirm in the macro.
+  [SSE2, AVX2] // NOTE(review): presumably resolved to the `INV_TXFM_WHT_FN_*` consts by name - confirm in the macro.
+);
+
+cpu_function_lookup_table!( // Same construction for the `InvTxfmHBDFunc` table.
+  INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
+  default: None, // NOTE(review): presumably the entry for CPU levels not listed below - confirm in the macro.
+  [SSE2, AVX2] // NOTE(review): presumably resolved to the `INV_TXFM_WHT_HBD_FN_*` consts by name - confirm in the macro.
+);
+
 macro_rules! decl_itx_fns {
   // Takes a 2d list of tx types for W and H
   ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
@@ -116,7 +172,7 @@ macro_rules! decl_itx_hbd_fns {
             // Note: type1 and type2 are flipped
             fn [<rav1e_inv_txfm_add_ $TYPE2 _$TYPE1 _$W x $H _ $BPC bpc_$OPT_LOWER>](
               dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16,
-              eob: i32,
+              eob: i32, bitdepth_max: i32,
             );
           }
         )*

diff --git a/src/transform/forward_shared.rs b/src/transform/forward_shared.rs
@@ -1741,15 +1741,12 @@ $($s)* fn fwht4<T: TxOperations>(coeffs: &mut [T]) {
   let s1 = x3.sub(x2);
   let s2 = s0.sub_avg(s1);
 
-  let q3 = s2.sub(x2);
-  let q0 = s0.sub(q3);
-  let q2 = s2.sub(x1);
-  let q1 = s1.add(q2);
-
-  coeffs[0] = q0;
-  coeffs[1] = q1;
-  coeffs[2] = q2;
-  coeffs[3] = q3;
+  let q1 = s2.sub(x2);
+  let q0 = s0.sub(q1);
+  let q3 = s2.sub(x1);
+  let q2 = s1.add(q3);
+
+  store_coeffs!(coeffs, q0, q1, q2, q3);
 }
 
 }

diff --git a/src/transform/inverse.rs b/src/transform/inverse.rs
@@ -48,9 +48,9 @@ pub fn av1_iwht4(input: &[i32], output: &mut [i32]) {
   let s3 = s4 - x3;
   let s1 = s4 - x1;
   output[0] = s0 - s3;
-  output[1] = s1;
-  output[2] = s2 + s1;
-  output[3] = s3;
+  output[1] = s3;
+  output[2] = s1;
+  output[3] = s2 + s1;
 }
 
 static COSPI_INV: [i32; 64] = [
@@ -1624,9 +1624,10 @@ pub(crate) mod rust {
   use simd_helpers::cold_for_target_arch;
   use std::cmp;
 
+  #[cold_for_target_arch("x86_64", "aarch64")]
   pub fn inverse_transform_add_lossless<T: Pixel>(
-    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
-    _cpu: CpuFeatureLevel,
+    input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
+    _bd: usize, _cpu: CpuFeatureLevel,
   ) {
     // <https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process>
     let input: &[T::Coeff] = &input[..4 * 4];

diff --git a/src/transform/mod.rs b/src/transform/mod.rs
@@ -511,7 +511,7 @@ mod test {
       *r = i16::cast_from(*s) - i16::cast_from(*d);
     }
     forward_transform_lossless(res, freq, 4, cpu);
-    inverse_transform_add_lossless(freq, &mut dst.as_region_mut(), cpu);
+    inverse_transform_add_lossless(freq, &mut dst.as_region_mut(), 15, 8, cpu);
 
     assert_eq!(&src[..], &dst.data[..]);
   }