Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions src/asm/aarch64/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,25 @@ use crate::{Pixel, PixelType};
use crate::asm::shared::transform::inverse::*;
use crate::asm::shared::transform::*;

pub use crate::transform::inverse::rust::inverse_transform_add_lossless;
/// Lossless inverse transform with add, dispatched per pixel type.
///
/// Looks up a kernel for the current CPU level in `INV_TXFM_WHT_FN`
/// (8-bit pixels) or `INV_TXFM_WHT_HBD_FN` (16-bit pixels) and invokes it
/// through the shared call helpers with a fixed 4x4 block size. When the
/// table slot is `None`, the work is handed to the Rust implementation
/// instead.
#[inline]
pub fn inverse_transform_add_lossless<T: Pixel>(
  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
  bd: usize, cpu: CpuFeatureLevel,
) {
  // Both tables are indexed by the same CPU feature level.
  let level = cpu.as_index();

  match T::type_enum() {
    PixelType::U8 => match INV_TXFM_WHT_FN[level] {
      Some(kernel) => call_inverse_func(kernel, input, output, eob, 4, 4, bd),
      None => {
        rust::inverse_transform_add_lossless(input, output, eob, bd, cpu)
      }
    },
    PixelType::U16 => match INV_TXFM_WHT_HBD_FN[level] {
      Some(kernel) => {
        call_inverse_hbd_func(kernel, input, output, eob, 4, 4, bd)
      }
      None => {
        rust::inverse_transform_add_lossless(input, output, eob, bd, cpu)
      }
    },
  }
}

pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
Expand Down Expand Up @@ -59,6 +77,32 @@ pub fn inverse_transform_add<T: Pixel>(
rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
}

// External NEON kernels for the 4x4 WHT/WHT inverse-transform-add (per the
// symbol names). Their parameter lists must stay in sync with the
// `InvTxfmFunc` (8bpc) and `InvTxfmHBDFunc` (16bpc) function-pointer aliases,
// because the symbols are cast to those types below.
extern {
// 8 bits-per-component variant: destination pixels are `u8`.
fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon(
dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
);
// 16 bits-per-component variant: destination pixels are `u16`, and it takes
// an extra `bitdepth_max` argument (`call_inverse_hbd_func` passes
// `(1 << bd) - 1` in that position).
fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon(
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
bitdepth_max: i32,
);
}
// NEON entries for the WHT dispatch tables: each wraps the matching extern
// kernel in `Some`, cast to the shared function-pointer alias.
// NOTE(review): the `<TABLE>_NEON` naming is presumably what
// `cpu_function_lookup_table!` looks up for its `[NEON]` list — confirm
// against the macro definition before renaming.
const INV_TXFM_WHT_FN_NEON: Option<InvTxfmFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_neon as _);
const INV_TXFM_WHT_HBD_FN_NEON: Option<InvTxfmHBDFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_neon as _);

// Table of 8-bit WHT kernels, indexed with `cpu.as_index()` by
// `inverse_transform_add_lossless`. A `None` slot makes that caller use the
// Rust implementation.
// NOTE(review): `default: None` is presumably the value used for CPU levels
// not named in the list — confirm against `cpu_function_lookup_table!`.
cpu_function_lookup_table!(
INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
default: None,
[NEON]
);

// Table of 16-bit (high bit depth) WHT kernels, indexed with
// `cpu.as_index()` by `inverse_transform_add_lossless`. A `None` slot makes
// that caller use the Rust implementation.
// NOTE(review): `default: None` is presumably the value used for CPU levels
// not named in the list — confirm against `cpu_function_lookup_table!`.
cpu_function_lookup_table!(
INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
default: None,
[NEON]
);

macro_rules! decl_itx_fns {
// Takes a 2d list of tx types for W and H
([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
Expand Down Expand Up @@ -102,7 +146,7 @@ macro_rules! decl_itx_hbd_fns {
// Note: type1 and type2 are flipped
fn [<rav1e_inv_txfm_add_ $TYPE2 _$TYPE1 _$W x $H _16bpc_$OPT_LOWER>](
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16,
eob: i32,
eob: i32, bitdepth_max: i32,
);
}
)*
Expand Down
5 changes: 3 additions & 2 deletions src/asm/shared/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub type InvTxfmFunc =
unsafe extern fn(*mut u8, libc::ptrdiff_t, *mut i16, i32);

pub type InvTxfmHBDFunc =
unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32);
unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32, i32);

pub fn call_inverse_func<T: Pixel>(
func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
Expand Down Expand Up @@ -51,7 +51,7 @@ pub fn call_inverse_func<T: Pixel>(
pub fn call_inverse_hbd_func<T: Pixel>(
func: InvTxfmHBDFunc, input: &[T::Coeff],
output: &mut PlaneRegionMut<'_, T>, eob: usize, width: usize, height: usize,
_bd: usize,
bd: usize,
) {
// Only use at most 32 columns and 32 rows of input coefficients.
let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
Expand All @@ -74,6 +74,7 @@ pub fn call_inverse_hbd_func<T: Pixel>(
T::to_asm_stride(output.plane_cfg.stride),
copied.data.as_mut_ptr() as *mut _,
eob as i32 - 1,
(1 << bd) - 1,
);
}
}
Expand Down
60 changes: 58 additions & 2 deletions src/asm/x86/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,25 @@ use crate::{Pixel, PixelType};
use crate::asm::shared::transform::inverse::*;
use crate::asm::shared::transform::*;

pub use crate::transform::inverse::rust::inverse_transform_add_lossless;
/// Lossless inverse transform with add, dispatched per pixel type.
///
/// For 8-bit pixels the kernel comes from `INV_TXFM_WHT_FN`; for 16-bit
/// pixels it comes from `INV_TXFM_WHT_HBD_FN`. Both are selected by the
/// caller's CPU feature level and invoked with a fixed 4x4 block size. If no
/// kernel is registered for that level, the Rust implementation runs instead.
#[inline]
pub fn inverse_transform_add_lossless<T: Pixel>(
  input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
  bd: usize, cpu: CpuFeatureLevel,
) {
  // Slot shared by both lookup tables.
  let slot = cpu.as_index();

  match T::type_enum() {
    PixelType::U8 => {
      if let Some(asm_fn) = INV_TXFM_WHT_FN[slot] {
        call_inverse_func(asm_fn, input, output, eob, 4, 4, bd)
      } else {
        rust::inverse_transform_add_lossless(input, output, eob, bd, cpu)
      }
    }
    PixelType::U16 => {
      if let Some(asm_fn) = INV_TXFM_WHT_HBD_FN[slot] {
        call_inverse_hbd_func(asm_fn, input, output, eob, 4, 4, bd)
      } else {
        rust::inverse_transform_add_lossless(input, output, eob, bd, cpu)
      }
    }
  }
}

pub fn inverse_transform_add<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: usize,
Expand Down Expand Up @@ -73,6 +91,44 @@ pub fn inverse_transform_add<T: Pixel>(
rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
}

// External SSE2/AVX2 kernels for the 4x4 WHT/WHT inverse-transform-add (per
// the symbol names). Their parameter lists must stay in sync with the
// `InvTxfmFunc` (8bpc) and `InvTxfmHBDFunc` (16bpc) function-pointer aliases,
// because the symbols are cast to those types below.
extern {
// 8 bits-per-component variants: destination pixels are `u8`.
fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2(
dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
);
fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2(
dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
);
// 16 bits-per-component variants: destination pixels are `u16`, and they
// take an extra `bitdepth_max` argument (`call_inverse_hbd_func` passes
// `(1 << bd) - 1` in that position).
fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2(
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
bitdepth_max: i32,
);
fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2(
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
bitdepth_max: i32,
);
}

// SSE2/AVX2 entries for the WHT dispatch tables: each wraps the matching
// extern kernel in `Some`, cast to the shared function-pointer alias.
// NOTE(review): the `<TABLE>_<LEVEL>` naming is presumably what
// `cpu_function_lookup_table!` looks up for its `[SSE2, AVX2]` list — confirm
// against the macro definition before renaming.
const INV_TXFM_WHT_FN_AVX2: Option<InvTxfmFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2 as _);
const INV_TXFM_WHT_FN_SSE2: Option<InvTxfmFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2 as _);
const INV_TXFM_WHT_HBD_FN_AVX2: Option<InvTxfmHBDFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2 as _);
const INV_TXFM_WHT_HBD_FN_SSE2: Option<InvTxfmHBDFunc> =
Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2 as _);

// Table of 8-bit WHT kernels, indexed with `cpu.as_index()` by
// `inverse_transform_add_lossless`. A `None` slot makes that caller use the
// Rust implementation.
// NOTE(review): `default: None` is presumably the value used for CPU levels
// not named in the list — confirm against `cpu_function_lookup_table!`.
cpu_function_lookup_table!(
INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
default: None,
[SSE2, AVX2]
);

// Table of 16-bit (high bit depth) WHT kernels, indexed with
// `cpu.as_index()` by `inverse_transform_add_lossless`. A `None` slot makes
// that caller use the Rust implementation.
// NOTE(review): `default: None` is presumably the value used for CPU levels
// not named in the list — confirm against `cpu_function_lookup_table!`.
cpu_function_lookup_table!(
INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
default: None,
[SSE2, AVX2]
);

macro_rules! decl_itx_fns {
// Takes a 2d list of tx types for W and H
([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
Expand Down Expand Up @@ -116,7 +172,7 @@ macro_rules! decl_itx_hbd_fns {
// Note: type1 and type2 are flipped
fn [<rav1e_inv_txfm_add_ $TYPE2 _$TYPE1 _$W x $H _ $BPC bpc_$OPT_LOWER>](
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16,
eob: i32,
eob: i32, bitdepth_max: i32,
);
}
)*
Expand Down
15 changes: 6 additions & 9 deletions src/transform/forward_shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1741,15 +1741,12 @@ $($s)* fn fwht4<T: TxOperations>(coeffs: &mut [T]) {
let s1 = x3.sub(x2);
let s2 = s0.sub_avg(s1);

let q3 = s2.sub(x2);
let q0 = s0.sub(q3);
let q2 = s2.sub(x1);
let q1 = s1.add(q2);

coeffs[0] = q0;
coeffs[1] = q1;
coeffs[2] = q2;
coeffs[3] = q3;
let q1 = s2.sub(x2);
let q0 = s0.sub(q1);
let q3 = s2.sub(x1);
let q2 = s1.add(q3);

store_coeffs!(coeffs, q0, q1, q2, q3);
}

}
Expand Down
11 changes: 6 additions & 5 deletions src/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ pub fn av1_iwht4(input: &[i32], output: &mut [i32]) {
let s3 = s4 - x3;
let s1 = s4 - x1;
output[0] = s0 - s3;
output[1] = s1;
output[2] = s2 + s1;
output[3] = s3;
output[1] = s3;
output[2] = s1;
output[3] = s2 + s1;
}

static COSPI_INV: [i32; 64] = [
Expand Down Expand Up @@ -1624,9 +1624,10 @@ pub(crate) mod rust {
use simd_helpers::cold_for_target_arch;
use std::cmp;

#[cold_for_target_arch("x86_64", "aarch64")]
pub fn inverse_transform_add_lossless<T: Pixel>(
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>,
_cpu: CpuFeatureLevel,
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: usize,
_bd: usize, _cpu: CpuFeatureLevel,
) {
// <https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process>
let input: &[T::Coeff] = &input[..4 * 4];
Expand Down
2 changes: 1 addition & 1 deletion src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ mod test {
*r = i16::cast_from(*s) - i16::cast_from(*d);
}
forward_transform_lossless(res, freq, 4, cpu);
inverse_transform_add_lossless(freq, &mut dst.as_region_mut(), cpu);
inverse_transform_add_lossless(freq, &mut dst.as_region_mut(), 15, 8, cpu);

assert_eq!(&src[..], &dst.data[..]);
}
Expand Down